xref: /llvm-project/llvm/test/CodeGen/AMDGPU/packed-fp32.ll (revision ca955197047ce044dec1e85fd401b1788550602d)
1; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
2; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
3; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
4; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
5; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
6
; Adds a loaded <2 x float> to itself (both fadd operands are %load).
; The gfx900 run is expected to emit two v_add_f32_e32; the gfx90a/gfx940 runs
; (PACKED checks) expect a single v_pk_add_f32 whose sources are VGPR ranges.
7; GCN-LABEL: {{^}}fadd_v2_vv:
8; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
9; PACKED:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
10define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
11  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
12  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
13  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
14  %add = fadd <2 x float> %load, %load
15  store <2 x float> %add, ptr addrspace(1) %gep, align 8
16  ret void
17}
18
; Adds a loaded <2 x float> to the kernel argument %x.
; The checks expect the argument to appear as an SGPR source: per-element
; v_add_f32_e32 on gfx900, one v_pk_add_f32 with an s[...] operand and no
; trailing modifiers on the packed targets.
19; GCN-LABEL: {{^}}fadd_v2_vs:
20; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
21; PACKED:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
22define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
23  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
24  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
25  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
26  %add = fadd <2 x float> %load, %x
27  store <2 x float> %add, ptr addrspace(1) %gep, align 8
28  ret void
29}
30
; <4 x float> variant of the vector + kernel-argument add.
; Expected instruction counts: four v_add_f32_e32 on gfx900, two v_pk_add_f32
; on the packed targets.
31; GCN-LABEL: {{^}}fadd_v4_vs:
32; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
33; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
34define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
35  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
36  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
37  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
38  %add = fadd <4 x float> %load, %x
39  store <4 x float> %add, ptr addrspace(1) %gep, align 16
40  ret void
41}
42
; <32 x float> variant of the vector + kernel-argument add.
; Expected instruction counts: 32 v_add_f32_e32 on gfx900, 16 v_pk_add_f32 on
; the packed targets.
43; GCN-LABEL: {{^}}fadd_v32_vs:
44; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
45; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
46define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
47  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
48  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
49  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
50  %add = fadd <32 x float> %load, %x
51  store <32 x float> %add, ptr addrspace(1) %gep, align 128
52  ret void
53}
54
55; FIXME: GISel does not use op_sel for splat constants.
56
; Adds the splat constant <100.0, 100.0> (0x42c80000 per element).
; gfx900 is expected to use the literal directly in each v_add_f32_e32.
; The packed targets are expected to materialize it once with s_mov_b32 into
; SGPR K and use K as the first register of the SGPR range operand; the
; SelectionDAG run additionally expects op_sel_hi:[1,0], the GlobalISel run
; expects no modifiers.
57; GCN-LABEL: {{^}}fadd_v2_v_imm:
58; PACKED:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
59; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
60; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
61; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
62define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
63  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
64  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
65  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
66  %add = fadd <2 x float> %load, <float 100.0, float 100.0>
67  store <2 x float> %add, ptr addrspace(1) %gep, align 8
68  ret void
69}
70
; Adds a splat of the workitem id (bitcast to float) to the loaded vector.
; The checks pin the splat source to v0 on gfx900 and to v[0:1] on the packed
; targets; only the SelectionDAG run expects op_sel_hi:[1,0].
71; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
72; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
73; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
74; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
75define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
76  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
77  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
78  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
79  %fid = bitcast i32 %id to float
80  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
81  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
82  %add = fadd <2 x float> %load, %k
83  store <2 x float> %add, ptr addrspace(1) %gep, align 8
84  ret void
85}
86
; Adds the splat constant <1.0, 1.0>.
; gfx900 expects 1.0 as an operand of each v_add_f32_e32. On the packed
; targets the SelectionDAG run expects 1.0 directly in v_pk_add_f32 with
; op_sel_hi:[1,0], while the GlobalISel run expects an SGPR range operand.
87; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
88; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
89; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
90; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
91define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
92  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
93  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
94  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
95  %add = fadd <2 x float> %load, <float 1.0, float 1.0>
96  store <2 x float> %add, ptr addrspace(1) %gep, align 8
97  ret void
98}
99
; Adds the non-splat constant <1.0, 0.0> (element 1 is zero).
; gfx900 expects one add with 0 and one with 1.0, in either order. The packed
; targets expect a single s_mov_b64 of 0x3f800000 into an SGPR pair K, which
; is then the last operand of v_pk_add_f32.
100; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
101; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
102; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
103; PACKED-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
104; PACKED:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
105define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
106  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
107  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
108  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
109  %add = fadd <2 x float> %load, <float 1.0, float 0.0>
110  store <2 x float> %add, ptr addrspace(1) %gep, align 8
111  ret void
112}
113
; Adds the non-splat constant <0.0, 1.0> (element 0 is zero).
; gfx900 expects one add with 0 and one with 1.0, in either order. The packed
; targets expect two s_mov_b32 (0 into LO, 1.0 into HI) and the resulting
; s[LO:HI] pair as the last operand of v_pk_add_f32, with no modifiers.
114; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0:
115; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
116; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
117; PACKED-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
118; PACKED-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
119; PACKED:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}}
120define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
121  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
122  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
123  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
124  %add = fadd <2 x float> %load, <float 0.0, float 1.0>
125  store <2 x float> %add, ptr addrspace(1) %gep, align 8
126  ret void
127}
128
; Adds the non-splat constant <1.0, 2.0>.
; gfx900 expects each constant as an operand of its own v_add_f32_e32. The
; packed targets expect both constants to be moved into SGPRs with s_mov_b32
; and an SGPR range as the last operand of v_pk_add_f32, with no modifiers.
129; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit:
130; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
131; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
132; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
133; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
134; PACKED:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
135define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
136  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
137  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
138  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
139  %add = fadd <2 x float> %load, <float 1.0, float 2.0>
140  store <2 x float> %add, ptr addrspace(1) %gep, align 8
141  ret void
142}
143
144; FIXME: Fold fneg into v_pk_add_f32 with Global ISel.
145
; Adds a splat of the negated scalar argument (%fneg = -0.0 - %x in both
; elements).
; gfx900 expects two v_subrev_f32_e32. On the packed targets the SelectionDAG
; run expects the negation as neg_lo/neg_hi modifiers on an SGPR operand,
; together with op_sel_hi:[1,0]; the GlobalISel run expects a plain VGPR
; operand with no modifiers.
146; GCN-LABEL: {{^}}fadd_v2_v_fneg:
147; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
148; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
149; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
150define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
151  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
152  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
153  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
154  %fneg = fsub float -0.0, %x
155  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
156  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
157  %add = fadd <2 x float> %load, %k
158  store <2 x float> %add, ptr addrspace(1) %gep, align 8
159  ret void
160}
161
; Second operand is <-%x, %x>: only element 0 is negated, both elements come
; from the same scalar argument.
; gfx900 expects one v_add_f32_e32 and one v_subrev_f32_e32. The SelectionDAG
; packed run expects op_sel_hi:[1,0] with neg_lo:[0,1] only; the GlobalISel
; run expects a VGPR operand with no modifiers.
162; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo:
163; GFX900-DAG:   v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
164; GFX900-DAG:   v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
165; PACKED-SDAG:  v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
166; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
167define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
168  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
169  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
170  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
171  %fneg = fsub float -0.0, %x
172  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
173  %k = insertelement <2 x float> %tmp1, float %x, i64 1
174  %add = fadd <2 x float> %load, %k
175  store <2 x float> %add, ptr addrspace(1) %gep, align 8
176  ret void
177}
178
; Second operand is <%x, -%x>: only element 1 is negated, both elements come
; from the same scalar argument.
; gfx900 expects one v_add_f32_e32 and one v_subrev_f32_e32. The SelectionDAG
; packed run expects op_sel_hi:[1,0] with neg_hi:[0,1] only; the GlobalISel
; run expects a VGPR operand with no modifiers.
179; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi:
180; GFX900-DAG:   v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
181; GFX900-DAG:   v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
182; PACKED-SDAG:  v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
183; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
184define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
185  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
186  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
187  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
188  %fneg = fsub float -0.0, %x
189  %tmp1 = insertelement <2 x float> undef, float %x, i64 0
190  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
191  %add = fadd <2 x float> %load, %k
192  store <2 x float> %add, ptr addrspace(1) %gep, align 8
193  ret void
194}
195
; Second operand is <-%x, %y>: element 0 is a negated argument, element 1 is
; a different, un-negated argument.
; gfx900 expects one v_add_f32_e32 and one v_subrev_f32_e32. The SelectionDAG
; packed run expects only neg_lo:[0,1] (no op_sel modifiers); the GlobalISel
; run expects a VGPR operand with no modifiers.
196; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2:
197; GFX900-DAG:   v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
198; GFX900-DAG:   v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
199; PACKED-SDAG:  v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
200; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
201define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) {
202  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
203  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
204  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
205  %fneg = fsub float -0.0, %x
206  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
207  %k = insertelement <2 x float> %tmp1, float %y, i64 1
208  %add = fadd <2 x float> %load, %k
209  store <2 x float> %add, ptr addrspace(1) %gep, align 8
210  ret void
211}
212
; Second operand is <%y, -%x>: element 0 is un-negated %y, element 1 is the
; negated %x (the arguments are in swapped positions relative to the
; parameter order).
; gfx900 expects one v_add_f32_e32 and one v_subrev_f32_e32. The SelectionDAG
; packed run expects op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]; the GlobalISel
; run expects a VGPR operand with no modifiers.
213; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2:
214; GFX900-DAG:   v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
215; GFX900-DAG:   v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
216; PACKED-SDAG:  v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
217; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
218define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) {
219  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
220  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
221  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
222  %fneg = fsub float -0.0, %x
223  %tmp1 = insertelement <2 x float> undef, float %y, i64 0
224  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
225  %add = fadd <2 x float> %load, %k
226  store <2 x float> %add, ptr addrspace(1) %gep, align 8
227  ret void
228}
229
; Multiplies a loaded <2 x float> by itself (both fmul operands are %load).
; gfx900 expects two v_mul_f32_e32; the packed targets expect a single
; v_pk_mul_f32 whose sources are VGPR ranges.
230; GCN-LABEL: {{^}}fmul_v2_vv:
231; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
232; PACKED:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
233define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
234  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
235  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
236  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
237  %mul = fmul <2 x float> %load, %load
238  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
239  ret void
240}
241
; Multiplies a loaded <2 x float> by the kernel argument %x.
; The checks expect the argument as an SGPR source: per-element v_mul_f32_e32
; on gfx900, one v_pk_mul_f32 with an s[...] operand and no trailing modifiers
; on the packed targets.
242; GCN-LABEL: {{^}}fmul_v2_vs:
243; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
244; PACKED:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
245define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
246  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
247  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
248  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
249  %mul = fmul <2 x float> %load, %x
250  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
251  ret void
252}
253
; <4 x float> variant of the vector * kernel-argument multiply.
; Expected instruction counts: four v_mul_f32_e32 on gfx900, two v_pk_mul_f32
; on the packed targets.
254; GCN-LABEL: {{^}}fmul_v4_vs:
255; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
256; PACKED-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
257define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
258  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
259  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
260  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
261  %mul = fmul <4 x float> %load, %x
262  store <4 x float> %mul, ptr addrspace(1) %gep, align 16
263  ret void
264}
265
; <32 x float> variant of the vector * kernel-argument multiply.
; Expected instruction counts: 32 v_mul_f32_e32 on gfx900, 16 v_pk_mul_f32 on
; the packed targets.
266; GCN-LABEL: {{^}}fmul_v32_vs:
267; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
268; PACKED-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
269define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
270  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
271  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
272  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
273  %mul = fmul <32 x float> %load, %x
274  store <32 x float> %mul, ptr addrspace(1) %gep, align 128
275  ret void
276}
277
; Multiplies by the splat constant <100.0, 100.0> (0x42c80000 per element).
; gfx900 is expected to use the literal directly in each v_mul_f32_e32.
; The packed targets are expected to materialize it once with s_mov_b32 into
; SGPR K and use K as the first register of the SGPR range operand; the
; SelectionDAG run additionally expects op_sel_hi:[1,0], the GlobalISel run
; expects no modifiers.
278; GCN-LABEL: {{^}}fmul_v2_v_imm:
279; PACKED:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
280; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
281; PACKED-SDAG:    v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
282; PACKED-GISEL:   v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
283define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
284  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
285  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
286  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
287  %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
288  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
289  ret void
290}
291
; Multiplies the loaded vector by a splat of the workitem id (bitcast to
; float).
; The checks pin the splat source to v0 on gfx900 and to v[0:1] on the packed
; targets; only the SelectionDAG run expects op_sel_hi:[1,0].
292; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
293; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
294; PACKED-SDAG:    v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
295; PACKED-GISEL:   v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
296define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
297  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
298  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
299  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
300  %fid = bitcast i32 %id to float
301  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
302  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
303  %mul = fmul <2 x float> %load, %k
304  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
305  ret void
306}
307
; Multiplies by the splat constant <4.0, 4.0>.
; gfx900 expects 4.0 as an operand of each v_mul_f32_e32. On the packed
; targets the SelectionDAG run expects 4.0 directly in v_pk_mul_f32 with
; op_sel_hi:[1,0], while the GlobalISel run expects an SGPR range operand.
308; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
309; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
310; PACKED-SDAG:    v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
311; PACKED-GISEL:   v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
312define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
313  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
314  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
315  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
316  %mul = fmul <2 x float> %load, <float 4.0, float 4.0>
317  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
318  ret void
319}
320
; Multiplies by the non-splat constant <4.0, 3.0>; the checks spell 3.0 as the
; literal 0x40400000.
; gfx900 expects each constant as an operand of its own v_mul_f32_e32. The
; packed targets expect both constants to be moved into SGPRs with s_mov_b32
; and an SGPR range as the last operand of v_pk_mul_f32, with no modifiers.
321; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit:
322; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
323; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}}
324; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
325; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
326; PACKED:     v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
327define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
328  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
329  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
330  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
331  %mul = fmul <2 x float> %load, <float 4.0, float 3.0>
332  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
333  ret void
334}
335
; Multiplies by a splat of the negated scalar argument (%fneg = -0.0 - %x in
; both elements).
; gfx900 expects two v_mul_f32_e64 with a negated SGPR source (-s...). On the
; packed targets the SelectionDAG run expects neg_lo/neg_hi modifiers on an
; SGPR operand together with op_sel_hi:[1,0]; the GlobalISel run expects a
; plain VGPR operand with no modifiers.
336; GCN-LABEL: {{^}}fmul_v2_v_fneg:
337; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
338; PACKED-SDAG:    v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
339; PACKED-GISEL:   v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
340define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
341  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
342  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
343  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
344  %fneg = fsub float -0.0, %x
345  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
346  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
347  %mul = fmul <2 x float> %load, %k
348  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
349  ret void
350}
351
; llvm.fma.v2f32 with the loaded vector used for all three operands.
; gfx900 expects two v_fma_f32; the packed targets expect a single
; v_pk_fma_f32 whose sources are VGPR ranges.
352; GCN-LABEL: {{^}}fma_v2_vv:
353; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
354; PACKED:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
355define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
356  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
357  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
358  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
359  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
360  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
361  ret void
362}
363
; llvm.fma.v2f32 with the loaded vector as the first operand and the kernel
; argument %x as both the multiplier and the addend.
; The checks expect both argument uses as SGPR sources: two v_fma_f32 on
; gfx900, one v_pk_fma_f32 with no trailing modifiers on the packed targets.
364; GCN-LABEL: {{^}}fma_v2_vs:
365; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
366; PACKED:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
367define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
368  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
369  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
370  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
371  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
372  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
373  ret void
374}
375
; <4 x float> variant of the fma with a kernel argument as multiplier and
; addend.
; Expected instruction counts: four v_fma_f32 on gfx900, two v_pk_fma_f32 on
; the packed targets.
376; GCN-LABEL: {{^}}fma_v4_vs:
377; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
378; PACKED-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
379define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
380  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
381  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
382  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
383  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
384  store <4 x float> %fma, ptr addrspace(1) %gep, align 16
385  ret void
386}
387
; <32 x float> variant of the fma with a kernel argument as multiplier and
; addend.
; Expected instruction counts: 32 v_fma_f32 on gfx900, 16 v_pk_fma_f32 on the
; packed targets.
388; GCN-LABEL: {{^}}fma_v32_vs:
389; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
390; PACKED-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
391define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
392  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
393  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
394  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
395  %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
396  store <32 x float> %fma, ptr addrspace(1) %gep, align 128
397  ret void
398}
399
; fma with splat constants: multiplier <100.0, 100.0> (0x42c80000) and addend
; <200.0, 200.0> (0x43480000).
; Every run expects the multiplier in SGPR K1 via s_mov_b32. gfx900 and the
; SelectionDAG packed run also expect the addend in VGPR K2 via v_mov_b32_e32.
; The SelectionDAG packed run expects op_sel_hi:[1,0,0] on v_pk_fma_f32; the
; GlobalISel run expects no modifiers and does not pin the addend register.
400; GCN-LABEL: {{^}}fma_v2_v_imm:
401; GCN-DAG:         s_mov_b32 s[[K1:[0-9]+]], 0x42c80000
402; GFX900-DAG:      v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
403; PACKED-SDAG-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
404; GFX900-COUNT-2:  v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
405; PACKED-SDAG:     v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
406; PACKED-GISEL:    v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
407define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
408  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
409  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
410  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
411  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
412  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
413  ret void
414}
415
; fma whose multiplier and addend are both a splat of the workitem id (bitcast
; to float).
; The checks pin both splat sources to v0 on gfx900 and to v[0:1] on the
; packed targets; only the SelectionDAG run expects op_sel_hi:[1,0,0].
416; GCN-LABEL: {{^}}fma_v2_v_v_splat:
417; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
418; PACKED-SDAG:    v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
419; PACKED-GISEL:   v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1]{{$}}
420define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
421  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
422  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
423  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
424  %fid = bitcast i32 %id to float
425  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
426  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
427  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
428  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
429  ret void
430}
431
; fma with splat constants: multiplier <4.0, 4.0> and addend <1.0, 1.0>.
; gfx900 and the SelectionDAG packed run expect 4.0 and 1.0 directly as
; instruction operands (the packed form with op_sel_hi:[1,0,0]); the GlobalISel
; run expects an SGPR range and a VGPR range instead.
432; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
433; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
434; PACKED-SDAG:    v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
435; PACKED-GISEL:   v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
436define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
437  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
438  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
439  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
440  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
441  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
442  ret void
443}
444
; fma with non-splat constants: multiplier <4.0, 3.0> and addend <1.0, 2.0>.
; Every run expects 3.0 (0x40400000) to be moved into an SGPR. gfx900 expects
; one v_fma_f32 using 4.0/1.0 directly and one using the SGPR and 2.0. The
; SelectionDAG packed run additionally expects 4.0 in an SGPR and 1.0/2.0 in
; VGPRs. Both packed runs expect one v_pk_fma_f32 with an SGPR range, a VGPR
; range, and no modifiers.
445; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit:
446; GCN-DAG:         s_mov_b32 s{{[0-9]+}}, 0x40400000
447; GFX900-DAG:      v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
448; GFX900-DAG:      v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0
449; PACKED-SDAG-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
450; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
451; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
452; PACKED:          v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
453define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
454  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
455  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
456  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
457  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
458  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
459  ret void
460}
461
; fma whose multiplier and addend are both a splat of the negated scalar
; argument (%fneg = -0.0 - %x).
; gfx900 expects two v_fma_f32 with negated SGPR sources. The SelectionDAG
; packed run expects op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1] on SGPR
; operands; the GlobalISel run expects VGPR operands with no modifiers.
462; GCN-LABEL: {{^}}fma_v2_v_fneg:
463; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
464; PACKED-SDAG:    v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
465; PACKED-GISEL:   v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
466define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
467  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
468  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
469  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
470  %fneg = fsub float -0.0, %x
471  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
472  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
473  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
474  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
475  ret void
476}
477
; Adds a volatile-loaded <2 x float> to a broadcast of a negated, volatile-
; loaded scalar; both loads use addrspace(3) pointers. The broadcast is built
; with insertelement + shufflevector (zeroinitializer mask).
; gfx900 expects two v_sub_f32_e32. The SelectionDAG packed run expects
; op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]; the GlobalISel run expects no
; modifiers.
; NOTE(review): the function name mentions "bitcast" but the body contains no
; bitcast instruction.
478; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
479; GFX900-COUNT-2: v_sub_f32_e32
480; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
481; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
482define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
483bb:
484  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
485  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
486  %neg.scalar0 = fsub float -0.0, %scalar0
487
488  %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0
489  %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer
490
491  %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
492  store <2 x float> %result, ptr addrspace(1) %out, align 4
493  ret void
494}
495
; fma of two volatile-loaded vectors with a negated addend. The addend vector
; is assembled from two separately loaded scalars (%scalar0 into element 0,
; %scalar1 into element 1) and then negated as a whole vector.
; gfx900 expects two v_fma_f32 with a negated VGPR as the last source. The
; SelectionDAG packed run expects neg_lo:[0,0,1] neg_hi:[0,0,1]; the GlobalISel
; run expects no modifiers.
496; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
497; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
498; PACKED-SDAG:    v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
499; PACKED-GISEL:   v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
500define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
501bb:
502  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
503  %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2
504
505  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
506  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 4
507
508  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
509  %scalar1 = load volatile float, ptr addrspace(3) %arg2.gep, align 4
510
511  %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0
512  %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
513  %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2
514
515  %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
516  store <2 x float> %result, ptr addrspace(1) %out, align 4
517  ret void
518}
519
; Adds %vec0 to %vec1 with its two elements swapped (shufflevector mask
; <1, 0>).
; gfx900 expects two v_add_f32_e32. The SelectionDAG packed run expects
; op_sel:[0,1] op_sel_hi:[1,0] on v_pk_add_f32; the GlobalISel run expects no
; modifiers.
; NOTE(review): the define line references attribute group #0, which has no
; definition in the part of the file visible here - confirm this is intended.
520; GCN-LABEL: {{^}}shuffle_add_f32:
521; GFX900-COUNT-2: v_add_f32_e32
522; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
523; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
524define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
525bb:
526  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
527  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
528  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
529  %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
530  %result = fadd <2 x float> %vec0, %vec1.swap
531  store <2 x float> %result, ptr addrspace(1) %out, align 8
532  ret void
533}
534
; Adds %vec0 to %vec1 after negating it and swapping its two elements
; (shufflevector mask <1, 0>).
; gfx900 expects two v_sub_f32_e32. The SelectionDAG packed run expects
; op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]; the GlobalISel run
; expects no modifiers.
; NOTE(review): %f32 is a volatile load from an undef pointer whose result is
; never used - presumably kept on purpose; confirm before removing.
; NOTE(review): the define line references attribute group #0, which has no
; definition in the part of the file visible here - confirm this is intended.
535; GCN-LABEL: {{^}}shuffle_neg_add_f32:
536; GFX900-COUNT-2: v_sub_f32_e32
537; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
538; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
539define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
540bb:
541  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
542  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
543  %f32 = load volatile float, ptr addrspace(3) undef, align 8
544  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
545  %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
546  %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0>
547  %result = fadd <2 x float> %vec0, %vec1.neg.swap
548  store <2 x float> %result, ptr addrspace(1) %out, align 8
549  ret void
550}
551
; Chain of fadd / shuffle / fadd / shuffle / fsub in which every constant
; operand is zeroinitializer; the result is stored through an undef pointer.
; gfx900 and the SelectionDAG packed run expect the same two scalar adds with
; a 0 operand (no packed instruction). The GlobalISel packed run expects two
; v_pk_add_f32 with no modifiers.
552; GCN-LABEL: {{^}}fadd_fadd_fsub_0:
553; GFX900:       v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
554; GFX900:       v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
555
556; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
557; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
558
559; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
560; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
561define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
562bb:
563  %i12 = fadd <2 x float> zeroinitializer, %arg
564  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
565  %i13 = fadd <2 x float> zeroinitializer, %shift8
566  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
567  %i15 = fsub <2 x float> %i14, zeroinitializer
568  store <2 x float> %i15, ptr undef
569  ret void
570}
571
; Same fadd / shuffle / fadd / shuffle / fsub chain as the zero-constant
; variant, but with the kernel argument %arg1 in place of the zero vectors and
; a real output pointer.
; gfx900 expects two v_add_f32_e32. The SelectionDAG packed run expects one
; scalar v_add_f32_e32 followed by one v_pk_add_f32 with op_sel_hi:[1,0]. The
; GlobalISel packed run expects two v_pk_add_f32 with no modifiers.
572; GCN-LABEL: {{^}}fadd_fadd_fsub:
573; GFX900:       v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
574; GFX900:       v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
575
576; PACKED-SDAG:  v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
577; PACKED-SDAG:  v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
578
579; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
580; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
581define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
582bb:
583  %i12 = fadd <2 x float> %arg, %arg1
584  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
585  %i13 = fadd <2 x float> %arg1, %shift8
586  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
587  %i15 = fsub <2 x float> %i14, %arg1
588  store <2 x float> %i15, ptr addrspace(1) %ptr
589  ret void
590}
591
; Adds a loaded <4 x float> to a splat of its own element 0 (shufflevector
; with a zeroinitializer mask).
; gfx900 expects four v_add_f32_e32. The packed targets expect two
; v_pk_add_f32: with op_sel_hi:[1,0] in the SelectionDAG run, with no
; modifiers in the GlobalISel run.
592; GCN-LABEL: {{^}}fadd_shuffle_v4:
593; GFX900-COUNT-4:       v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
594; PACKED-SDAG-COUNT-2:  v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
595; PACKED-GISEL-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
596define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
597bb:
598  %tid = call i32 @llvm.amdgcn.workitem.id.x()
599  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
600  %in.1 = load <4 x float>, ptr addrspace(1) %gep
601  %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer
602  %add.1 = fadd <4 x float> %in.1, %shuf
603  store <4 x float> %add.1, ptr addrspace(1) %gep
604  ret void
605}
606
; Negates a loaded <2 x float>, written as fsub <-0.0, -0.0>, %load.
; gfx900 expects two v_xor_b32_e32 with 0x80000000. The SelectionDAG packed
; run expects a single v_pk_add_f32 with a 0 operand and neg_lo:[1,1]
; neg_hi:[1,1]. The GlobalISel packed run expects the two xors followed by a
; v_pk_mul_f32 by 1.0 with op_sel_hi:[0,1].
607; GCN-LABEL: {{^}}fneg_v2f32_vec:
608; GFX900-COUNT-2:       v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
609; PACKED-SDAG:          v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
610; PACKED-GISEL-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
611; PACKED-GISEL:         v_pk_mul_f32 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] op_sel_hi:[0,1]{{$}}
612define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
613  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
614  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
615  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
616  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
617  store <2 x float> %fneg, ptr addrspace(1) %gep, align 8
618  ret void
619}
620
; Negates the <2 x float> kernel argument %x and stores it.
; Every run (the shared GCN prefix) expects two s_xor_b32 with 0x80000000 and
; no packed instruction is checked for.
621; GCN-LABEL: {{^}}fneg_v2f32_scalar:
622; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
623define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) {
624  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
625  store <2 x float> %fneg, ptr addrspace(1) %a, align 8
626  ret void
627}
628
; Declarations of the intrinsics called by the tests in this file: the
; workitem-id query and llvm.fma at the three vector widths used
; (<2 x float>, <4 x float>, <32 x float>).
629declare i32 @llvm.amdgcn.workitem.id.x()
630declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
631declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
632declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
633