xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s
4
; Scalar case: rounds a double passed as a kernel argument with
; llvm.round.f64 and stores the result to %out.
; The tahiti checks (SI prefix) expect the truncation to be open-coded with
; integer ops: s_bfe_u32 with 0xb0014 pulls an 11-bit field at bit 20 of the
; high dword, 0xfc01 is added to it, and the result drives a 64-bit mask
; shift plus two compare/select pairs (< 0 and > 51).
; The hawaii checks (CI prefix) expect a single v_trunc_f64 instead.
; Both prefixes then expect |x - trunc(x)| >= 0.5 to select 0x3ff00000 or 0 as
; the high dword of the addend, a v_bfi_b32 using the s_brev_b32 -2 mask to
; merge in the high dword of x, and a final v_add_f64 onto the truncated value.
5define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 {
6; SI-LABEL: round_f64:
7; SI:       ; %bb.0:
8; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
9; SI-NEXT:    s_mov_b32 s6, -1
10; SI-NEXT:    s_mov_b32 s5, 0xfffff
11; SI-NEXT:    s_mov_b32 s4, s6
12; SI-NEXT:    s_waitcnt lgkmcnt(0)
13; SI-NEXT:    s_bfe_u32 s7, s3, 0xb0014
14; SI-NEXT:    s_addk_i32 s7, 0xfc01
15; SI-NEXT:    s_lshr_b64 s[4:5], s[4:5], s7
16; SI-NEXT:    s_and_b32 s8, s3, 0x80000000
17; SI-NEXT:    s_andn2_b64 s[4:5], s[2:3], s[4:5]
18; SI-NEXT:    s_cmp_lt_i32 s7, 0
19; SI-NEXT:    s_cselect_b32 s4, 0, s4
20; SI-NEXT:    s_cselect_b32 s5, s8, s5
21; SI-NEXT:    s_cmp_gt_i32 s7, 51
22; SI-NEXT:    s_cselect_b32 s8, s2, s4
23; SI-NEXT:    s_cselect_b32 s9, s3, s5
24; SI-NEXT:    v_mov_b32_e32 v0, s8
25; SI-NEXT:    v_mov_b32_e32 v1, s9
26; SI-NEXT:    v_add_f64 v[0:1], s[2:3], -v[0:1]
27; SI-NEXT:    s_mov_b32 s4, s0
28; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
29; SI-NEXT:    s_brev_b32 s2, -2
30; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
31; SI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
32; SI-NEXT:    v_mov_b32_e32 v0, s0
33; SI-NEXT:    v_mov_b32_e32 v1, s3
34; SI-NEXT:    v_bfi_b32 v1, s2, v0, v1
35; SI-NEXT:    v_mov_b32_e32 v0, 0
36; SI-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
37; SI-NEXT:    s_mov_b32 s7, 0xf000
38; SI-NEXT:    s_mov_b32 s5, s1
39; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
40; SI-NEXT:    s_endpgm
41;
42; CI-LABEL: round_f64:
43; CI:       ; %bb.0:
44; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
45; CI-NEXT:    s_brev_b32 s5, -2
46; CI-NEXT:    s_mov_b32 s7, 0xf000
47; CI-NEXT:    s_mov_b32 s6, -1
48; CI-NEXT:    s_waitcnt lgkmcnt(0)
49; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[2:3]
50; CI-NEXT:    s_mov_b32 s4, s0
51; CI-NEXT:    v_add_f64 v[2:3], s[2:3], -v[0:1]
52; CI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5
53; CI-NEXT:    v_mov_b32_e32 v2, s3
54; CI-NEXT:    s_and_b64 s[2:3], s[8:9], exec
55; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
56; CI-NEXT:    v_mov_b32_e32 v3, s0
57; CI-NEXT:    v_bfi_b32 v3, s5, v3, v2
58; CI-NEXT:    v_mov_b32_e32 v2, 0
59; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
60; CI-NEXT:    s_mov_b32 s5, s1
61; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
62; CI-NEXT:    s_endpgm
63  %result = call double @llvm.round.f64(double %x) #1
64  store double %result, ptr addrspace(1) %out
65  ret void
66}
67
; Per-workitem case: each workitem loads one double from %in at index
; llvm.amdgcn.workitem.id.x, rounds it with llvm.round.f64, and stores the
; result to the same index of %out.
; The input arrives via buffer_load_dwordx2 into v[2:3], so the checks for the
; SI prefix expect the vector-ALU form of the open-coded truncation
; (v_bfe_u32 ... 20, 11, v_lshr_b64, v_not_b32/v_and_b32, v_cndmask_b32), and
; the checks for the CI prefix expect v_trunc_f64.
; Both prefixes then expect the |x - trunc(x)| >= 0.5 compare, a v_cndmask_b32
; choosing between 0 and 0x3ff00000, v_bfi_b32 with the s_brev_b32 -2 mask, and
; a final v_add_f64.
68define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
69; SI-LABEL: v_round_f64:
70; SI:       ; %bb.0:
71; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
72; SI-NEXT:    s_mov_b32 s7, 0xf000
73; SI-NEXT:    s_mov_b32 s6, 0
74; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
75; SI-NEXT:    v_mov_b32_e32 v1, 0
76; SI-NEXT:    s_waitcnt lgkmcnt(0)
77; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
78; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
79; SI-NEXT:    s_movk_i32 s4, 0xfc01
80; SI-NEXT:    s_mov_b32 s2, -1
81; SI-NEXT:    s_mov_b32 s3, 0xfffff
82; SI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
83; SI-NEXT:    s_waitcnt vmcnt(0)
84; SI-NEXT:    v_bfe_u32 v4, v3, 20, 11
85; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v4
86; SI-NEXT:    v_lshr_b64 v[4:5], s[2:3], v6
87; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v3
88; SI-NEXT:    v_not_b32_e32 v5, v5
89; SI-NEXT:    v_not_b32_e32 v4, v4
90; SI-NEXT:    v_and_b32_e32 v5, v3, v5
91; SI-NEXT:    v_and_b32_e32 v4, v2, v4
92; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
93; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
94; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
95; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 51, v6
96; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
97; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v2, vcc
98; SI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
99; SI-NEXT:    s_brev_b32 s2, -2
100; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
101; SI-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
102; SI-NEXT:    v_bfi_b32 v3, s2, v2, v3
103; SI-NEXT:    v_mov_b32_e32 v2, v1
104; SI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
105; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
106; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
107; SI-NEXT:    s_endpgm
108;
109; CI-LABEL: v_round_f64:
110; CI:       ; %bb.0:
111; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
112; CI-NEXT:    s_mov_b32 s7, 0xf000
113; CI-NEXT:    s_mov_b32 s6, 0
114; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
115; CI-NEXT:    v_mov_b32_e32 v1, 0
116; CI-NEXT:    s_waitcnt lgkmcnt(0)
117; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
118; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
119; CI-NEXT:    v_mov_b32_e32 v8, 0x3ff00000
120; CI-NEXT:    s_brev_b32 s2, -2
121; CI-NEXT:    s_waitcnt vmcnt(0)
122; CI-NEXT:    v_trunc_f64_e32 v[4:5], v[2:3]
123; CI-NEXT:    v_add_f64 v[6:7], v[2:3], -v[4:5]
124; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
125; CI-NEXT:    v_cndmask_b32_e32 v2, 0, v8, vcc
126; CI-NEXT:    v_bfi_b32 v3, s2, v2, v3
127; CI-NEXT:    v_mov_b32_e32 v2, v1
128; CI-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
129; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
130; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
131; CI-NEXT:    s_endpgm
132  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
133  %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
134  %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
135  %x = load double, ptr addrspace(1) %gep
136  %result = call double @llvm.round.f64(double %x) #1
137  store double %result, ptr addrspace(1) %out.gep
138  ret void
139}
140
; Two-element vector case: rounds a <2 x double> kernel argument with
; llvm.round.v2f64 and stores the result to %out.
; The checks expect the per-element sequence to appear once per lane: two
; s_bfe_u32-based truncations for the SI prefix, two v_trunc_f64 for the CI
; prefix, each followed by the >= 0.5 compare, 0x3ff00000/0 select, v_bfi_b32
; and v_add_f64.
; Both lanes are expected to be written with a single buffer_store_dwordx4.
141define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) #0 {
142; SI-LABEL: round_v2f64:
143; SI:       ; %bb.0:
144; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
145; SI-NEXT:    s_mov_b32 s2, -1
146; SI-NEXT:    s_mov_b32 s7, 0xfffff
147; SI-NEXT:    s_mov_b32 s6, s2
148; SI-NEXT:    s_waitcnt lgkmcnt(0)
149; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
150; SI-NEXT:    s_add_i32 s12, s0, 0xfffffc01
151; SI-NEXT:    s_lshr_b64 s[0:1], s[6:7], s12
152; SI-NEXT:    s_and_b32 s3, s11, 0x80000000
153; SI-NEXT:    s_andn2_b64 s[0:1], s[10:11], s[0:1]
154; SI-NEXT:    s_cmp_lt_i32 s12, 0
155; SI-NEXT:    s_cselect_b32 s0, 0, s0
156; SI-NEXT:    s_cselect_b32 s1, s3, s1
157; SI-NEXT:    s_cmp_gt_i32 s12, 51
158; SI-NEXT:    s_cselect_b32 s12, s10, s0
159; SI-NEXT:    s_cselect_b32 s13, s11, s1
160; SI-NEXT:    v_mov_b32_e32 v0, s12
161; SI-NEXT:    v_mov_b32_e32 v1, s13
162; SI-NEXT:    v_add_f64 v[0:1], s[10:11], -v[0:1]
163; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
164; SI-NEXT:    v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5
165; SI-NEXT:    s_brev_b32 s10, -2
166; SI-NEXT:    s_and_b64 s[4:5], s[14:15], exec
167; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
168; SI-NEXT:    v_mov_b32_e32 v0, s3
169; SI-NEXT:    s_bfe_u32 s3, s9, 0xb0014
170; SI-NEXT:    s_addk_i32 s3, 0xfc01
171; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s3
172; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
173; SI-NEXT:    s_and_b32 s6, s9, 0x80000000
174; SI-NEXT:    s_cmp_lt_i32 s3, 0
175; SI-NEXT:    s_cselect_b32 s4, 0, s4
176; SI-NEXT:    s_cselect_b32 s5, s6, s5
177; SI-NEXT:    s_cmp_gt_i32 s3, 51
178; SI-NEXT:    s_cselect_b32 s4, s8, s4
179; SI-NEXT:    s_cselect_b32 s5, s9, s5
180; SI-NEXT:    v_mov_b32_e32 v2, s4
181; SI-NEXT:    v_mov_b32_e32 v3, s5
182; SI-NEXT:    v_add_f64 v[2:3], s[8:9], -v[2:3]
183; SI-NEXT:    v_mov_b32_e32 v1, s11
184; SI-NEXT:    v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5
185; SI-NEXT:    v_bfi_b32 v1, s10, v0, v1
186; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
187; SI-NEXT:    v_mov_b32_e32 v0, 0
188; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
189; SI-NEXT:    v_add_f64 v[2:3], s[12:13], v[0:1]
190; SI-NEXT:    v_mov_b32_e32 v1, s3
191; SI-NEXT:    v_mov_b32_e32 v4, s9
192; SI-NEXT:    v_bfi_b32 v1, s10, v1, v4
193; SI-NEXT:    v_add_f64 v[0:1], s[4:5], v[0:1]
194; SI-NEXT:    s_mov_b32 s3, 0xf000
195; SI-NEXT:    s_waitcnt lgkmcnt(0)
196; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
197; SI-NEXT:    s_endpgm
198;
199; CI-LABEL: round_v2f64:
200; CI:       ; %bb.0:
201; CI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
202; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
203; CI-NEXT:    s_brev_b32 s2, -2
204; CI-NEXT:    v_mov_b32_e32 v0, 0
205; CI-NEXT:    s_mov_b32 s3, 0xf000
206; CI-NEXT:    s_waitcnt lgkmcnt(0)
207; CI-NEXT:    v_trunc_f64_e32 v[2:3], s[10:11]
208; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[8:9]
209; CI-NEXT:    v_add_f64 v[4:5], s[10:11], -v[2:3]
210; CI-NEXT:    v_mov_b32_e32 v1, s11
211; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5
212; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[6:7]
213; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
214; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
215; CI-NEXT:    v_mov_b32_e32 v8, s4
216; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5
217; CI-NEXT:    v_bfi_b32 v1, s2, v8, v1
218; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
219; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
220; CI-NEXT:    v_add_f64 v[2:3], v[2:3], v[0:1]
221; CI-NEXT:    v_mov_b32_e32 v1, s4
222; CI-NEXT:    v_mov_b32_e32 v4, s9
223; CI-NEXT:    v_bfi_b32 v1, s2, v1, v4
224; CI-NEXT:    v_add_f64 v[0:1], v[6:7], v[0:1]
225; CI-NEXT:    s_mov_b32 s2, -1
226; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
227; CI-NEXT:    s_endpgm
228  %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
229  store <2 x double> %result, ptr addrspace(1) %out
230  ret void
231}
232
; Four-element vector case: rounds a <4 x double> kernel argument with
; llvm.round.v4f64 and stores the result to %out.
; The checks expect the per-element sequence four times: four s_bfe_u32-based
; truncations for the SI prefix and four v_trunc_f64 for the CI prefix, each
; with its own >= 0.5 compare, 0x3ff00000/0 select, v_bfi_b32 and v_add_f64.
; The result is expected to be written with two buffer_store_dwordx4
; instructions (offset:16 first, then offset 0).
233define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) #0 {
234; SI-LABEL: round_v4f64:
235; SI:       ; %bb.0:
236; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
237; SI-NEXT:    s_mov_b32 s2, -1
238; SI-NEXT:    s_mov_b32 s7, 0xfffff
239; SI-NEXT:    s_mov_b32 s6, s2
240; SI-NEXT:    v_mov_b32_e32 v4, 0
241; SI-NEXT:    s_waitcnt lgkmcnt(0)
242; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
243; SI-NEXT:    s_add_i32 s16, s0, 0xfffffc01
244; SI-NEXT:    s_lshr_b64 s[0:1], s[6:7], s16
245; SI-NEXT:    s_and_b32 s3, s11, 0x80000000
246; SI-NEXT:    s_andn2_b64 s[0:1], s[10:11], s[0:1]
247; SI-NEXT:    s_cmp_lt_i32 s16, 0
248; SI-NEXT:    s_cselect_b32 s0, 0, s0
249; SI-NEXT:    s_cselect_b32 s1, s3, s1
250; SI-NEXT:    s_cmp_gt_i32 s16, 51
251; SI-NEXT:    s_cselect_b32 s16, s10, s0
252; SI-NEXT:    s_cselect_b32 s17, s11, s1
253; SI-NEXT:    v_mov_b32_e32 v0, s16
254; SI-NEXT:    v_mov_b32_e32 v1, s17
255; SI-NEXT:    v_add_f64 v[0:1], s[10:11], -v[0:1]
256; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
257; SI-NEXT:    v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5
258; SI-NEXT:    v_mov_b32_e32 v1, s11
259; SI-NEXT:    s_and_b64 s[4:5], s[18:19], exec
260; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
261; SI-NEXT:    v_mov_b32_e32 v0, s3
262; SI-NEXT:    s_bfe_u32 s3, s9, 0xb0014
263; SI-NEXT:    s_addk_i32 s3, 0xfc01
264; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s3
265; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
266; SI-NEXT:    s_and_b32 s10, s9, 0x80000000
267; SI-NEXT:    s_cmp_lt_i32 s3, 0
268; SI-NEXT:    s_cselect_b32 s4, 0, s4
269; SI-NEXT:    s_cselect_b32 s5, s10, s5
270; SI-NEXT:    s_cmp_gt_i32 s3, 51
271; SI-NEXT:    s_brev_b32 s18, -2
272; SI-NEXT:    s_cselect_b32 s4, s8, s4
273; SI-NEXT:    v_bfi_b32 v5, s18, v0, v1
274; SI-NEXT:    s_cselect_b32 s5, s9, s5
275; SI-NEXT:    v_mov_b32_e32 v0, s4
276; SI-NEXT:    v_mov_b32_e32 v1, s5
277; SI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[0:1]
278; SI-NEXT:    v_add_f64 v[2:3], s[16:17], v[4:5]
279; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
280; SI-NEXT:    v_mov_b32_e32 v6, s9
281; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
282; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
283; SI-NEXT:    v_mov_b32_e32 v5, s3
284; SI-NEXT:    s_bfe_u32 s3, s15, 0xb0014
285; SI-NEXT:    s_addk_i32 s3, 0xfc01
286; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s3
287; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[8:9]
288; SI-NEXT:    s_and_b32 s10, s15, 0x80000000
289; SI-NEXT:    s_cmp_lt_i32 s3, 0
290; SI-NEXT:    s_cselect_b32 s8, 0, s8
291; SI-NEXT:    s_cselect_b32 s9, s10, s9
292; SI-NEXT:    s_cmp_gt_i32 s3, 51
293; SI-NEXT:    s_cselect_b32 s8, s14, s8
294; SI-NEXT:    s_cselect_b32 s9, s15, s9
295; SI-NEXT:    v_mov_b32_e32 v0, s8
296; SI-NEXT:    v_mov_b32_e32 v1, s9
297; SI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[0:1]
298; SI-NEXT:    v_bfi_b32 v5, s18, v5, v6
299; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
300; SI-NEXT:    v_add_f64 v[0:1], s[4:5], v[4:5]
301; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
302; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
303; SI-NEXT:    v_mov_b32_e32 v8, s3
304; SI-NEXT:    s_bfe_u32 s3, s13, 0xb0014
305; SI-NEXT:    s_addk_i32 s3, 0xfc01
306; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s3
307; SI-NEXT:    s_andn2_b64 s[4:5], s[12:13], s[4:5]
308; SI-NEXT:    s_and_b32 s6, s13, 0x80000000
309; SI-NEXT:    s_cmp_lt_i32 s3, 0
310; SI-NEXT:    s_cselect_b32 s4, 0, s4
311; SI-NEXT:    s_cselect_b32 s5, s6, s5
312; SI-NEXT:    s_cmp_gt_i32 s3, 51
313; SI-NEXT:    s_cselect_b32 s5, s13, s5
314; SI-NEXT:    s_cselect_b32 s4, s12, s4
315; SI-NEXT:    v_mov_b32_e32 v6, s5
316; SI-NEXT:    v_mov_b32_e32 v5, s4
317; SI-NEXT:    v_add_f64 v[6:7], s[12:13], -v[5:6]
318; SI-NEXT:    v_mov_b32_e32 v9, s15
319; SI-NEXT:    v_cmp_ge_f64_e64 s[6:7], |v[6:7]|, 0.5
320; SI-NEXT:    v_bfi_b32 v5, s18, v8, v9
321; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
322; SI-NEXT:    s_cselect_b32 s3, 0x3ff00000, 0
323; SI-NEXT:    v_add_f64 v[6:7], s[8:9], v[4:5]
324; SI-NEXT:    v_mov_b32_e32 v5, s3
325; SI-NEXT:    v_mov_b32_e32 v8, s13
326; SI-NEXT:    v_bfi_b32 v5, s18, v5, v8
327; SI-NEXT:    v_add_f64 v[4:5], s[4:5], v[4:5]
328; SI-NEXT:    s_mov_b32 s3, 0xf000
329; SI-NEXT:    s_waitcnt lgkmcnt(0)
330; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
331; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
332; SI-NEXT:    s_endpgm
333;
334; CI-LABEL: round_v4f64:
335; CI:       ; %bb.0:
336; CI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
337; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
338; CI-NEXT:    s_brev_b32 s2, -2
339; CI-NEXT:    v_mov_b32_e32 v4, 0
340; CI-NEXT:    s_mov_b32 s3, 0xf000
341; CI-NEXT:    s_waitcnt lgkmcnt(0)
342; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
343; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[8:9]
344; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
345; CI-NEXT:    v_mov_b32_e32 v5, s11
346; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
347; CI-NEXT:    v_add_f64 v[2:3], s[8:9], -v[6:7]
348; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
349; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
350; CI-NEXT:    v_mov_b32_e32 v8, s4
351; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5
352; CI-NEXT:    v_bfi_b32 v5, s2, v8, v5
353; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[14:15]
354; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
355; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
356; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
357; CI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[8:9]
358; CI-NEXT:    v_mov_b32_e32 v5, s4
359; CI-NEXT:    v_mov_b32_e32 v10, s9
360; CI-NEXT:    v_bfi_b32 v5, s2, v5, v10
361; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5
362; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[12:13]
363; CI-NEXT:    v_add_f64 v[0:1], v[6:7], v[4:5]
364; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
365; CI-NEXT:    v_add_f64 v[6:7], s[12:13], -v[10:11]
366; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
367; CI-NEXT:    v_mov_b32_e32 v5, s4
368; CI-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5
369; CI-NEXT:    v_mov_b32_e32 v12, s15
370; CI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
371; CI-NEXT:    v_bfi_b32 v5, s2, v5, v12
372; CI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
373; CI-NEXT:    v_add_f64 v[6:7], v[8:9], v[4:5]
374; CI-NEXT:    v_mov_b32_e32 v5, s4
375; CI-NEXT:    v_mov_b32_e32 v8, s13
376; CI-NEXT:    v_bfi_b32 v5, s2, v5, v8
377; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
378; CI-NEXT:    s_mov_b32 s2, -1
379; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
380; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
381; CI-NEXT:    s_endpgm
382  %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
383  store <4 x double> %result, ptr addrspace(1) %out
384  ret void
385}
386
; Eight-element vector case: rounds an <8 x double> kernel argument (loaded
; with one s_load_dwordx16) with llvm.round.v8f64 and stores the result to
; %out.
; The checks expect the per-element sequence eight times: eight
; s_bfe_u32-based truncations for the SI prefix and eight v_trunc_f64 for the
; CI prefix, each with its own >= 0.5 compare, 0x3ff00000/0 select, v_bfi_b32
; and v_add_f64.
; The result is expected to be written with four buffer_store_dwordx4
; instructions at offsets 48, 32, 16 and 0.
387define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) #0 {
388; SI-LABEL: round_v8f64:
389; SI:       ; %bb.0:
390; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
391; SI-NEXT:    s_mov_b32 s2, -1
392; SI-NEXT:    s_mov_b32 s7, 0xfffff
393; SI-NEXT:    s_mov_b32 s6, s2
394; SI-NEXT:    v_mov_b32_e32 v8, 0
395; SI-NEXT:    s_waitcnt lgkmcnt(0)
396; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
397; SI-NEXT:    s_add_i32 s24, s0, 0xfffffc01
398; SI-NEXT:    s_lshr_b64 s[0:1], s[6:7], s24
399; SI-NEXT:    s_and_b32 s3, s11, 0x80000000
400; SI-NEXT:    s_andn2_b64 s[0:1], s[10:11], s[0:1]
401; SI-NEXT:    s_cmp_lt_i32 s24, 0
402; SI-NEXT:    s_cselect_b32 s0, 0, s0
403; SI-NEXT:    s_cselect_b32 s1, s3, s1
404; SI-NEXT:    s_cmp_gt_i32 s24, 51
405; SI-NEXT:    s_cselect_b32 s24, s10, s0
406; SI-NEXT:    s_cselect_b32 s25, s11, s1
407; SI-NEXT:    v_mov_b32_e32 v0, s24
408; SI-NEXT:    v_mov_b32_e32 v1, s25
409; SI-NEXT:    v_add_f64 v[0:1], s[10:11], -v[0:1]
410; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
411; SI-NEXT:    v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5
412; SI-NEXT:    v_mov_b32_e32 v1, s11
413; SI-NEXT:    s_and_b64 s[4:5], s[26:27], exec
414; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
415; SI-NEXT:    v_mov_b32_e32 v0, s4
416; SI-NEXT:    s_bfe_u32 s4, s9, 0xb0014
417; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
418; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
419; SI-NEXT:    s_andn2_b64 s[4:5], s[8:9], s[4:5]
420; SI-NEXT:    s_and_b32 s11, s9, 0x80000000
421; SI-NEXT:    s_cmp_lt_i32 s10, 0
422; SI-NEXT:    s_cselect_b32 s4, 0, s4
423; SI-NEXT:    s_cselect_b32 s5, s11, s5
424; SI-NEXT:    s_cmp_gt_i32 s10, 51
425; SI-NEXT:    s_brev_b32 s3, -2
426; SI-NEXT:    s_cselect_b32 s4, s8, s4
427; SI-NEXT:    v_bfi_b32 v9, s3, v0, v1
428; SI-NEXT:    s_cselect_b32 s5, s9, s5
429; SI-NEXT:    v_mov_b32_e32 v0, s4
430; SI-NEXT:    v_mov_b32_e32 v1, s5
431; SI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[0:1]
432; SI-NEXT:    v_mov_b32_e32 v5, s9
433; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
434; SI-NEXT:    v_add_f64 v[2:3], s[24:25], v[8:9]
435; SI-NEXT:    s_and_b64 s[10:11], s[10:11], exec
436; SI-NEXT:    s_cselect_b32 s8, 0x3ff00000, 0
437; SI-NEXT:    v_mov_b32_e32 v4, s8
438; SI-NEXT:    s_bfe_u32 s8, s15, 0xb0014
439; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
440; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s10
441; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[8:9]
442; SI-NEXT:    s_and_b32 s11, s15, 0x80000000
443; SI-NEXT:    s_cmp_lt_i32 s10, 0
444; SI-NEXT:    s_cselect_b32 s8, 0, s8
445; SI-NEXT:    s_cselect_b32 s9, s11, s9
446; SI-NEXT:    s_cmp_gt_i32 s10, 51
447; SI-NEXT:    s_cselect_b32 s8, s14, s8
448; SI-NEXT:    s_cselect_b32 s9, s15, s9
449; SI-NEXT:    v_mov_b32_e32 v0, s8
450; SI-NEXT:    v_mov_b32_e32 v1, s9
451; SI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[0:1]
452; SI-NEXT:    v_bfi_b32 v9, s3, v4, v5
453; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5
454; SI-NEXT:    v_add_f64 v[0:1], s[4:5], v[8:9]
455; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
456; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
457; SI-NEXT:    v_mov_b32_e32 v6, s4
458; SI-NEXT:    s_bfe_u32 s4, s13, 0xb0014
459; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
460; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
461; SI-NEXT:    s_andn2_b64 s[4:5], s[12:13], s[4:5]
462; SI-NEXT:    s_and_b32 s11, s13, 0x80000000
463; SI-NEXT:    s_cmp_lt_i32 s10, 0
464; SI-NEXT:    s_cselect_b32 s4, 0, s4
465; SI-NEXT:    s_cselect_b32 s5, s11, s5
466; SI-NEXT:    s_cmp_gt_i32 s10, 51
467; SI-NEXT:    s_cselect_b32 s4, s12, s4
468; SI-NEXT:    s_cselect_b32 s5, s13, s5
469; SI-NEXT:    v_mov_b32_e32 v4, s4
470; SI-NEXT:    v_mov_b32_e32 v5, s5
471; SI-NEXT:    v_add_f64 v[4:5], s[12:13], -v[4:5]
472; SI-NEXT:    v_mov_b32_e32 v7, s15
473; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5
474; SI-NEXT:    v_bfi_b32 v9, s3, v6, v7
475; SI-NEXT:    v_add_f64 v[6:7], s[8:9], v[8:9]
476; SI-NEXT:    s_and_b64 s[8:9], s[10:11], exec
477; SI-NEXT:    s_cselect_b32 s8, 0x3ff00000, 0
478; SI-NEXT:    v_mov_b32_e32 v9, s8
479; SI-NEXT:    s_bfe_u32 s8, s19, 0xb0014
480; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
481; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s10
482; SI-NEXT:    s_andn2_b64 s[8:9], s[18:19], s[8:9]
483; SI-NEXT:    s_and_b32 s11, s19, 0x80000000
484; SI-NEXT:    s_cmp_lt_i32 s10, 0
485; SI-NEXT:    s_cselect_b32 s8, 0, s8
486; SI-NEXT:    s_cselect_b32 s9, s11, s9
487; SI-NEXT:    s_cmp_gt_i32 s10, 51
488; SI-NEXT:    s_cselect_b32 s8, s18, s8
489; SI-NEXT:    s_cselect_b32 s9, s19, s9
490; SI-NEXT:    v_mov_b32_e32 v4, s8
491; SI-NEXT:    v_mov_b32_e32 v5, s9
492; SI-NEXT:    v_add_f64 v[4:5], s[18:19], -v[4:5]
493; SI-NEXT:    v_mov_b32_e32 v10, s13
494; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5
495; SI-NEXT:    v_bfi_b32 v9, s3, v9, v10
496; SI-NEXT:    v_add_f64 v[4:5], s[4:5], v[8:9]
497; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
498; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
499; SI-NEXT:    v_mov_b32_e32 v12, s4
500; SI-NEXT:    s_bfe_u32 s4, s17, 0xb0014
501; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
502; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
503; SI-NEXT:    s_andn2_b64 s[4:5], s[16:17], s[4:5]
504; SI-NEXT:    s_and_b32 s11, s17, 0x80000000
505; SI-NEXT:    s_cmp_lt_i32 s10, 0
506; SI-NEXT:    s_cselect_b32 s4, 0, s4
507; SI-NEXT:    s_cselect_b32 s5, s11, s5
508; SI-NEXT:    s_cmp_gt_i32 s10, 51
509; SI-NEXT:    s_cselect_b32 s5, s17, s5
510; SI-NEXT:    s_cselect_b32 s4, s16, s4
511; SI-NEXT:    v_mov_b32_e32 v10, s5
512; SI-NEXT:    v_mov_b32_e32 v9, s4
513; SI-NEXT:    v_add_f64 v[10:11], s[16:17], -v[9:10]
514; SI-NEXT:    v_mov_b32_e32 v13, s19
515; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
516; SI-NEXT:    v_bfi_b32 v9, s3, v12, v13
517; SI-NEXT:    v_add_f64 v[12:13], s[8:9], v[8:9]
518; SI-NEXT:    s_and_b64 s[8:9], s[10:11], exec
519; SI-NEXT:    s_cselect_b32 s8, 0x3ff00000, 0
520; SI-NEXT:    v_mov_b32_e32 v14, s8
521; SI-NEXT:    s_bfe_u32 s8, s23, 0xb0014
522; SI-NEXT:    s_add_i32 s10, s8, 0xfffffc01
523; SI-NEXT:    s_lshr_b64 s[8:9], s[6:7], s10
524; SI-NEXT:    s_andn2_b64 s[8:9], s[22:23], s[8:9]
525; SI-NEXT:    s_and_b32 s11, s23, 0x80000000
526; SI-NEXT:    s_cmp_lt_i32 s10, 0
527; SI-NEXT:    s_cselect_b32 s8, 0, s8
528; SI-NEXT:    s_cselect_b32 s9, s11, s9
529; SI-NEXT:    s_cmp_gt_i32 s10, 51
530; SI-NEXT:    s_cselect_b32 s9, s23, s9
531; SI-NEXT:    s_cselect_b32 s8, s22, s8
532; SI-NEXT:    v_mov_b32_e32 v10, s9
533; SI-NEXT:    v_mov_b32_e32 v9, s8
534; SI-NEXT:    v_add_f64 v[10:11], s[22:23], -v[9:10]
535; SI-NEXT:    v_mov_b32_e32 v15, s17
536; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
537; SI-NEXT:    v_bfi_b32 v9, s3, v14, v15
538; SI-NEXT:    v_add_f64 v[10:11], s[4:5], v[8:9]
539; SI-NEXT:    s_and_b64 s[4:5], s[10:11], exec
540; SI-NEXT:    s_cselect_b32 s4, 0x3ff00000, 0
541; SI-NEXT:    v_mov_b32_e32 v9, s4
542; SI-NEXT:    s_bfe_u32 s4, s21, 0xb0014
543; SI-NEXT:    s_add_i32 s10, s4, 0xfffffc01
544; SI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s10
545; SI-NEXT:    s_andn2_b64 s[4:5], s[20:21], s[4:5]
546; SI-NEXT:    s_and_b32 s6, s21, 0x80000000
547; SI-NEXT:    s_cmp_lt_i32 s10, 0
548; SI-NEXT:    s_cselect_b32 s4, 0, s4
549; SI-NEXT:    s_cselect_b32 s5, s6, s5
550; SI-NEXT:    s_cmp_gt_i32 s10, 51
551; SI-NEXT:    s_cselect_b32 s5, s21, s5
552; SI-NEXT:    s_cselect_b32 s4, s20, s4
553; SI-NEXT:    v_mov_b32_e32 v15, s5
554; SI-NEXT:    v_mov_b32_e32 v14, s4
555; SI-NEXT:    v_add_f64 v[14:15], s[20:21], -v[14:15]
556; SI-NEXT:    v_mov_b32_e32 v16, s23
557; SI-NEXT:    v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5
558; SI-NEXT:    v_bfi_b32 v9, s3, v9, v16
559; SI-NEXT:    s_and_b64 s[6:7], s[6:7], exec
560; SI-NEXT:    s_cselect_b32 s6, 0x3ff00000, 0
561; SI-NEXT:    v_add_f64 v[16:17], s[8:9], v[8:9]
562; SI-NEXT:    v_mov_b32_e32 v9, s6
563; SI-NEXT:    v_mov_b32_e32 v14, s21
564; SI-NEXT:    v_bfi_b32 v9, s3, v9, v14
565; SI-NEXT:    v_add_f64 v[14:15], s[4:5], v[8:9]
566; SI-NEXT:    s_mov_b32 s3, 0xf000
567; SI-NEXT:    s_waitcnt lgkmcnt(0)
568; SI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
569; SI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
570; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
571; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
572; SI-NEXT:    s_endpgm
573;
574; CI-LABEL: round_v8f64:
575; CI:       ; %bb.0:
576; CI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
577; CI-NEXT:    s_brev_b32 s6, -2
578; CI-NEXT:    v_mov_b32_e32 v4, 0
579; CI-NEXT:    s_waitcnt lgkmcnt(0)
580; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
581; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[8:9]
582; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
583; CI-NEXT:    v_add_f64 v[8:9], s[8:9], -v[6:7]
584; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5
585; CI-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5
586; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
587; CI-NEXT:    s_cselect_b32 s7, 0x3ff00000, 0
588; CI-NEXT:    v_mov_b32_e32 v5, s11
589; CI-NEXT:    s_and_b64 s[0:1], s[2:3], exec
590; CI-NEXT:    v_mov_b32_e32 v2, s7
591; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[14:15]
592; CI-NEXT:    v_bfi_b32 v5, s6, v2, v5
593; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
594; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
595; CI-NEXT:    v_mov_b32_e32 v5, s0
596; CI-NEXT:    v_mov_b32_e32 v10, s9
597; CI-NEXT:    v_add_f64 v[0:1], s[14:15], -v[8:9]
598; CI-NEXT:    v_bfi_b32 v5, s6, v5, v10
599; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5
600; CI-NEXT:    v_add_f64 v[0:1], v[6:7], v[4:5]
601; CI-NEXT:    v_trunc_f64_e32 v[6:7], s[12:13]
602; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
603; CI-NEXT:    v_add_f64 v[10:11], s[12:13], -v[6:7]
604; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
605; CI-NEXT:    v_mov_b32_e32 v5, s0
606; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5
607; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[18:19]
608; CI-NEXT:    v_mov_b32_e32 v12, s15
609; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
610; CI-NEXT:    v_bfi_b32 v5, s6, v5, v12
611; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
612; CI-NEXT:    v_add_f64 v[12:13], s[18:19], -v[10:11]
613; CI-NEXT:    v_add_f64 v[8:9], v[8:9], v[4:5]
614; CI-NEXT:    v_mov_b32_e32 v5, s0
615; CI-NEXT:    v_mov_b32_e32 v14, s13
616; CI-NEXT:    v_bfi_b32 v5, s6, v5, v14
617; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
618; CI-NEXT:    v_trunc_f64_e32 v[14:15], s[16:17]
619; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
620; CI-NEXT:    v_add_f64 v[12:13], s[16:17], -v[14:15]
621; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
622; CI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
623; CI-NEXT:    v_mov_b32_e32 v5, s0
624; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[12:13]|, 0.5
625; CI-NEXT:    v_mov_b32_e32 v16, s19
626; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
627; CI-NEXT:    v_bfi_b32 v5, s6, v5, v16
628; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
629; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[22:23]
630; CI-NEXT:    v_add_f64 v[12:13], v[10:11], v[4:5]
631; CI-NEXT:    v_mov_b32_e32 v5, s0
632; CI-NEXT:    v_mov_b32_e32 v10, s17
633; CI-NEXT:    v_bfi_b32 v5, s6, v5, v10
634; CI-NEXT:    v_add_f64 v[18:19], s[22:23], -v[16:17]
635; CI-NEXT:    v_add_f64 v[10:11], v[14:15], v[4:5]
636; CI-NEXT:    v_trunc_f64_e32 v[14:15], s[20:21]
637; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
638; CI-NEXT:    v_add_f64 v[18:19], s[20:21], -v[14:15]
639; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
640; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5
641; CI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
642; CI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
643; CI-NEXT:    v_mov_b32_e32 v5, s2
644; CI-NEXT:    v_mov_b32_e32 v18, s23
645; CI-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
646; CI-NEXT:    v_bfi_b32 v5, s6, v5, v18
647; CI-NEXT:    v_mov_b32_e32 v18, s0
648; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
649; CI-NEXT:    v_mov_b32_e32 v19, s21
650; CI-NEXT:    v_add_f64 v[16:17], v[16:17], v[4:5]
651; CI-NEXT:    v_bfi_b32 v5, s6, v18, v19
652; CI-NEXT:    v_add_f64 v[14:15], v[14:15], v[4:5]
653; CI-NEXT:    s_mov_b32 s3, 0xf000
654; CI-NEXT:    s_mov_b32 s2, -1
655; CI-NEXT:    s_waitcnt lgkmcnt(0)
656; CI-NEXT:    buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48
657; CI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
658; CI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
659; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
660; CI-NEXT:    s_endpgm
661  %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
662  store <8 x double> %result, ptr addrspace(1) %out
663  ret void
664}
665
; Intrinsics called by the kernels in this file: the workitem id query used by
; v_round_f64 and one llvm.round overload per tested vector width.
666declare i32 @llvm.amdgcn.workitem.id.x() #1
667
668declare double @llvm.round.f64(double) #1
669declare <2 x double> @llvm.round.v2f64(<2 x double>) #1
670declare <4 x double> @llvm.round.v4f64(<4 x double>) #1
671declare <8 x double> @llvm.round.v8f64(<8 x double>) #1
672
; Attribute group #0 is attached to every kernel definition; group #1 is
; attached to the intrinsic declarations and their call sites.
673attributes #0 = { nounwind }
674attributes #1 = { nounwind readnone }
675