; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s

; FIXME: We would still like to vectorize the memory operations for VI.

; Simple 3-pair chain with loads and stores
define amdgpu_kernel void @test1_as_3_3_3_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c) {
; GCN-LABEL: @test1_as_3_3_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %mul5, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

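; Same chain, but %b and %c are generic (addrspace(0)) pointers while %a stays in LDS.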
define amdgpu_kernel void @test1_as_3_0_0(ptr addrspace(3) %a, ptr %b, ptr %c) {
; GCN-LABEL: @test1_as_3_0_0(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
; GCN-NEXT:    store <2 x half> [[TMP3]], ptr [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr %b, i64 1
  %i4 = load half, ptr %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, ptr %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr %c, i64 1
  store half %mul5, ptr %arrayidx5, align 2
  ret void
}

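; Loads from generic pointers, stores to LDS.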
define amdgpu_kernel void @test1_as_0_0_3_v2f16(ptr %a, ptr %b, ptr addrspace(3) %c) {
; GCN-LABEL: @test1_as_0_0_3_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = fmul <2 x half> [[TMP1]], [[TMP2]]
; GCN-NEXT:    store <2 x half> [[TMP3]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr %a, align 2
  %i1 = load half, ptr %b, align 2
  %mul = fmul half %i0, %i1
  %arrayidx3 = getelementptr inbounds half, ptr %a, i64 1
  %i3 = load half, ptr %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr %b, i64 1
  %i4 = load half, ptr %arrayidx4, align 2
  %mul5 = fmul half %i3, %i4
  store half %mul, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %mul5, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

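; Two scalar llvm.fma.f16 calls on adjacent elements vectorize to a single llvm.fma.v2f16.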
define amdgpu_kernel void @test1_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
; GCN-LABEL: @test1_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
; GCN-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(3) [[D:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %i2 = load half, ptr addrspace(3) %c, align 2
  %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  %i5 = load half, ptr addrspace(3) %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, ptr addrspace(3) %d, align 2
  %arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
  store half %fma1, ptr addrspace(3) %arrayidx6, align 2
  ret void
}

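; One fmul operand is a uniform scalar; it is splatted with an insertelement + shufflevector.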
define amdgpu_kernel void @mul_scalar_v2f16(ptr addrspace(3) %a, half %scalar, ptr addrspace(3) %c) {
; GCN-LABEL: @mul_scalar_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = insertelement <2 x half> poison, half [[SCALAR:%.*]], i32 0
; GCN-NEXT:    [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <2 x i32> zeroinitializer
; GCN-NEXT:    [[TMP4:%.*]] = fmul <2 x half> [[TMP1]], [[TMP3]]
; GCN-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %mul = fmul half %i0, %scalar
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %mul5 = fmul half %i3, %scalar
  store half %mul, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %mul5, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

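; Adjacent llvm.fabs.f16 calls vectorize to llvm.fabs.v2f16.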
define amdgpu_kernel void @fabs_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
; GCN-LABEL: @fabs_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
; GCN-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %fabs0 = call half @llvm.fabs.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %fabs1 = call half @llvm.fabs.f16(half %i3)
  store half %fabs0, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %fabs1, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

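; fabs feeds the first fma operand in both lanes, so the fabs and the fma both vectorize.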
define amdgpu_kernel void @test1_fabs_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
; GCN-LABEL: @test1_fabs_fma_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.fabs.v2f16(<2 x half> [[TMP1]])
; GCN-NEXT:    [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP4]], <2 x half> [[TMP2]], <2 x half> [[TMP3]])
; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %i2 = load half, ptr addrspace(3) %c, align 2
  %i0.fabs = call half @llvm.fabs.f16(half %i0)

  %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  %i5 = load half, ptr addrspace(3) %arrayidx5, align 2
  %i3.fabs = call half @llvm.fabs.f16(half %i3)

  %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
  store half %fma0, ptr addrspace(3) %d, align 2
  %arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
  store half %fma1, ptr addrspace(3) %arrayidx6, align 2
  ret void
}

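; fabs is applied to only one lane of the second fma operand, so that operand is
; gathered from scalars with insertelement before the vector fma.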
define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) {
; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
; GCN-NEXT:    [[I1:%.*]] = load half, ptr addrspace(3) [[B:%.*]], align 2
; GCN-NEXT:    [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]])
; GCN-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds half, ptr addrspace(3) [[B]], i64 1
; GCN-NEXT:    [[I4:%.*]] = load half, ptr addrspace(3) [[ARRAYIDX4]], align 2
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    [[TMP3:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0
; GCN-NEXT:    [[TMP4:%.*]] = insertelement <2 x half> [[TMP3]], half [[I4]], i32 1
; GCN-NEXT:    [[TMP5:%.*]] = call <2 x half> @llvm.fma.v2f16(<2 x half> [[TMP1]], <2 x half> [[TMP4]], <2 x half> [[TMP2]])
; GCN-NEXT:    store <2 x half> [[TMP5]], ptr addrspace(3) [[D:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %i1 = load half, ptr addrspace(3) %b, align 2
  %i2 = load half, ptr addrspace(3) %c, align 2
  %i1.fabs = call half @llvm.fabs.f16(half %i1)

  %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %arrayidx4 = getelementptr inbounds half, ptr addrspace(3) %b, i64 1
  %i4 = load half, ptr addrspace(3) %arrayidx4, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  %i5 = load half, ptr addrspace(3) %arrayidx5, align 2
  %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
  store half %fma0, ptr addrspace(3) %d, align 2
  %arrayidx6 = getelementptr inbounds half, ptr addrspace(3) %d, i64 1
  store half %fma1, ptr addrspace(3) %arrayidx6, align 2
  ret void
}

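; Adjacent llvm.canonicalize.f16 calls vectorize to llvm.canonicalize.v2f16.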
define amdgpu_kernel void @canonicalize_v2f16(ptr addrspace(3) %a, ptr addrspace(3) %c) {
; GCN-LABEL: @canonicalize_v2f16(
; GCN-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[A:%.*]], align 2
; GCN-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> [[TMP1]])
; GCN-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(3) [[C:%.*]], align 2
; GCN-NEXT:    ret void
;
  %i0 = load half, ptr addrspace(3) %a, align 2
  %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
  %arrayidx3 = getelementptr inbounds half, ptr addrspace(3) %a, i64 1
  %i3 = load half, ptr addrspace(3) %arrayidx3, align 2
  %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
  store half %canonicalize0, ptr addrspace(3) %c, align 2
  %arrayidx5 = getelementptr inbounds half, ptr addrspace(3) %c, i64 1
  store half %canonicalize1, ptr addrspace(3) %arrayidx5, align 2
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare half @llvm.fma.f16(half, half, half) #1
declare half @llvm.canonicalize.f16(half) #1

attributes #1 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX9: {{.*}}
; VI: {{.*}}