xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fabs.f64.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3
4declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
5
6declare double @fabs(double) readnone
7declare double @llvm.fabs.f64(double) readnone
8declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
9declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
10
; v_fabs_f64: per-workitem |x| of a double loaded from global memory.
; fabs only clears the sign bit, so SI lowers it to a single v_and_b32
; with 0x7fffffff on the high 32 bits of the loaded pair; the low dword
; passes through untouched.
11define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
12; SI-LABEL: v_fabs_f64:
13; SI:       ; %bb.0:
14; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
15; SI-NEXT:    s_mov_b32 s7, 0xf000
16; SI-NEXT:    s_mov_b32 s10, 0
17; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
18; SI-NEXT:    v_mov_b32_e32 v1, 0
19; SI-NEXT:    s_mov_b32 s11, s7
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
22; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
23; SI-NEXT:    s_mov_b32 s6, -1
24; SI-NEXT:    s_mov_b32 s4, s0
25; SI-NEXT:    s_mov_b32 s5, s1
26; SI-NEXT:    s_waitcnt vmcnt(0)
27; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
28; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
29; SI-NEXT:    s_endpgm
  ; Index the input buffer by workitem id (8 bytes per double).
30  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
31  %tidext = sext i32 %tid to i64
32  %gep = getelementptr double, ptr addrspace(1) %in, i64 %tidext
33  %val = load double, ptr addrspace(1) %gep, align 8
34  %fabs = call double @llvm.fabs.f64(double %val)
35  store double %fabs, ptr addrspace(1) %out
36  ret void
37}
38
; fabs_f64: |x| of a scalar kernel argument. Because the value arrives in
; SGPRs, the sign-bit clear is done scalar-side with s_bitset0_b32 on bit 31
; of the high dword (s3) before the pair is copied to VGPRs for the store.
39define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) {
40; SI-LABEL: fabs_f64:
41; SI:       ; %bb.0:
42; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
43; SI-NEXT:    s_mov_b32 s7, 0xf000
44; SI-NEXT:    s_waitcnt lgkmcnt(0)
45; SI-NEXT:    s_bitset0_b32 s3, 31
46; SI-NEXT:    s_mov_b32 s6, -1
47; SI-NEXT:    s_mov_b32 s4, s0
48; SI-NEXT:    s_mov_b32 s5, s1
49; SI-NEXT:    v_mov_b32_e32 v0, s2
50; SI-NEXT:    v_mov_b32_e32 v1, s3
51; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
52; SI-NEXT:    s_endpgm
53  %fabs = call double @llvm.fabs.f64(double %in)
54  store double %fabs, ptr addrspace(1) %out
55  ret void
56}
57
; fabs_v2f64: vector fabs on <2 x double>. Scalarized to one s_bitset0_b32
; per element, clearing bit 31 of each element's high dword (s1 and s3);
; the result is written with a single dwordx4 store.
58define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) {
59; SI-LABEL: fabs_v2f64:
60; SI:       ; %bb.0:
61; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
62; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
63; SI-NEXT:    s_mov_b32 s7, 0xf000
64; SI-NEXT:    s_waitcnt lgkmcnt(0)
65; SI-NEXT:    s_bitset0_b32 s3, 31
66; SI-NEXT:    s_bitset0_b32 s1, 31
67; SI-NEXT:    s_mov_b32 s6, -1
68; SI-NEXT:    v_mov_b32_e32 v0, s0
69; SI-NEXT:    v_mov_b32_e32 v2, s2
70; SI-NEXT:    v_mov_b32_e32 v1, s1
71; SI-NEXT:    v_mov_b32_e32 v3, s3
72; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
73; SI-NEXT:    s_endpgm
74  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
75  store <2 x double> %fabs, ptr addrspace(1) %out
76  ret void
77}
78
; fabs_v4f64: vector fabs on <4 x double>. Scalarized to four s_and_b32
; with 0x7fffffff, one on each element's high dword (s9, s11, s13, s15);
; the 32-byte result is written as two dwordx4 stores (offset 16, then 0).
79define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) {
80; SI-LABEL: fabs_v4f64:
81; SI:       ; %bb.0:
82; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
83; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
84; SI-NEXT:    s_mov_b32 s3, 0xf000
85; SI-NEXT:    s_mov_b32 s2, -1
86; SI-NEXT:    s_waitcnt lgkmcnt(0)
87; SI-NEXT:    s_and_b32 s4, s11, 0x7fffffff
88; SI-NEXT:    s_and_b32 s5, s15, 0x7fffffff
89; SI-NEXT:    s_and_b32 s6, s13, 0x7fffffff
90; SI-NEXT:    s_and_b32 s7, s9, 0x7fffffff
91; SI-NEXT:    v_mov_b32_e32 v0, s12
92; SI-NEXT:    v_mov_b32_e32 v2, s14
93; SI-NEXT:    v_mov_b32_e32 v4, s8
94; SI-NEXT:    v_mov_b32_e32 v6, s10
95; SI-NEXT:    v_mov_b32_e32 v1, s6
96; SI-NEXT:    v_mov_b32_e32 v3, s5
97; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
98; SI-NEXT:    v_mov_b32_e32 v5, s7
99; SI-NEXT:    v_mov_b32_e32 v7, s4
100; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
101; SI-NEXT:    s_endpgm
102  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
103  store <4 x double> %fabs, ptr addrspace(1) %out
104  ret void
105}
106
; fabs_fold_f64: fabs feeding an fmul. The fabs is folded away into the
; |s[8:9]| source modifier on v_mul_f64 — no separate and/bitset instruction
; is emitted. The [8 x i32] dummy args pad the kernarg offsets (0x13, 0x1d).
107define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) {
108; SI-LABEL: fabs_fold_f64:
109; SI:       ; %bb.0:
110; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
111; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x13
112; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
113; SI-NEXT:    s_mov_b32 s3, 0xf000
114; SI-NEXT:    s_mov_b32 s2, -1
115; SI-NEXT:    s_waitcnt lgkmcnt(0)
116; SI-NEXT:    v_mov_b32_e32 v0, s6
117; SI-NEXT:    v_mov_b32_e32 v1, s7
118; SI-NEXT:    v_mul_f64 v[0:1], |s[8:9]|, v[0:1]
119; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
120; SI-NEXT:    s_endpgm
121  %fabs = call double @llvm.fabs.f64(double %in0)
122  %fmul = fmul double %fabs, %in1
123  store double %fmul, ptr addrspace(1) %out
124  ret void
125}
126
; fabs_fn_fold_f64: same as fabs_fold_f64 but calling the @fabs libcall
; declaration instead of the llvm.fabs.f64 intrinsic. Codegen recognizes it
; identically and folds the fabs into the |s[8:9]| modifier on v_mul_f64.
127define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) {
128; SI-LABEL: fabs_fn_fold_f64:
129; SI:       ; %bb.0:
130; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
131; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x13
132; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
133; SI-NEXT:    s_mov_b32 s3, 0xf000
134; SI-NEXT:    s_mov_b32 s2, -1
135; SI-NEXT:    s_waitcnt lgkmcnt(0)
136; SI-NEXT:    v_mov_b32_e32 v0, s6
137; SI-NEXT:    v_mov_b32_e32 v1, s7
138; SI-NEXT:    v_mul_f64 v[0:1], |s[8:9]|, v[0:1]
139; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
140; SI-NEXT:    s_endpgm
141  %fabs = call double @fabs(double %in0)
142  %fmul = fmul double %fabs, %in1
143  store double %fmul, ptr addrspace(1) %out
144  ret void
145}
146
; fabs_free_f64: the i64 -> double bitcast is free (pure reinterpretation),
; so codegen matches fabs_f64 exactly: s_bitset0_b32 clears bit 31 of the
; high dword (s3) with no extra instructions for the bitcast.
147define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
148; SI-LABEL: fabs_free_f64:
149; SI:       ; %bb.0:
150; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
151; SI-NEXT:    s_mov_b32 s7, 0xf000
152; SI-NEXT:    s_waitcnt lgkmcnt(0)
153; SI-NEXT:    s_bitset0_b32 s3, 31
154; SI-NEXT:    s_mov_b32 s6, -1
155; SI-NEXT:    s_mov_b32 s4, s0
156; SI-NEXT:    s_mov_b32 s5, s1
157; SI-NEXT:    v_mov_b32_e32 v0, s2
158; SI-NEXT:    v_mov_b32_e32 v1, s3
159; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
160; SI-NEXT:    s_endpgm
161  %bc= bitcast i64 %in to double
162  %fabs = call double @llvm.fabs.f64(double %bc)
163  store double %fabs, ptr addrspace(1) %out
164  ret void
165}
166
; fabs_fn_free_f64: same as fabs_free_f64 but through the @fabs libcall
; declaration — the free bitcast plus the recognized fabs call still lower
; to the single s_bitset0_b32 on the high dword.
167define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) {
168; SI-LABEL: fabs_fn_free_f64:
169; SI:       ; %bb.0:
170; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
171; SI-NEXT:    s_mov_b32 s7, 0xf000
172; SI-NEXT:    s_waitcnt lgkmcnt(0)
173; SI-NEXT:    s_bitset0_b32 s3, 31
174; SI-NEXT:    s_mov_b32 s6, -1
175; SI-NEXT:    s_mov_b32 s4, s0
176; SI-NEXT:    s_mov_b32 s5, s1
177; SI-NEXT:    v_mov_b32_e32 v0, s2
178; SI-NEXT:    v_mov_b32_e32 v1, s3
179; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
180; SI-NEXT:    s_endpgm
181  %bc= bitcast i64 %in to double
182  %fabs = call double @fabs(double %bc)
183  store double %fabs, ptr addrspace(1) %out
184  ret void
185}
186