; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s

; Code generation tests for double-precision fabs on amdgcn, covering the
; llvm.fabs intrinsic (scalar, <2 x double>, <4 x double>) and calls to the
; declared @fabs function.
;
; The full target triple is given explicitly so that the expected assembly
; does not depend on the default triple of the machine running the test.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

declare double @fabs(double) readnone
declare double @llvm.fabs.f64(double) readnone
declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone
declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone

; fabs of a double loaded from memory at an index taken from the workitem id.
; The expected code clears the sign bit with a vector AND of 0x7fffffff on the
; high dword (v1) of the loaded value.
define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_fabs_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %tidext = sext i32 %tid to i64
  %gep = getelementptr double, ptr addrspace(1) %in, i64 %tidext
  %val = load double, ptr addrspace(1) %gep, align 8
  %fabs = call double @llvm.fabs.f64(double %val)
  store double %fabs, ptr addrspace(1) %out
  ret void
}

; fabs of a double passed directly as a kernel argument. The expected code
; clears bit 31 of the high dword (s3) with a single scalar bitset0.
define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) {
; SI-LABEL: fabs_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bitset0_b32 s3, 31
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
  %fabs = call double @llvm.fabs.f64(double %in)
  store double %fabs, ptr addrspace(1) %out
  ret void
}

; fabs of a <2 x double> kernel argument. The expected code issues one scalar
; bitset0 per element, on the high dwords s1 and s3.
define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) {
; SI-LABEL: fabs_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bitset0_b32 s3, 31
; SI-NEXT:    s_bitset0_b32 s1, 31
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
  %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
  store <2 x double> %fabs, ptr addrspace(1) %out
  ret void
}

; fabs of a <4 x double> kernel argument. The expected code masks the high
; dword of each element (s9, s11, s13, s15) with a scalar AND of 0x7fffffff
; and writes the result with two 16-byte stores.
define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) {
; SI-LABEL: fabs_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s4, s11, 0x7fffffff
; SI-NEXT:    s_and_b32 s5, s15, 0x7fffffff
; SI-NEXT:    s_and_b32 s6, s13, 0x7fffffff
; SI-NEXT:    s_and_b32 s7, s9, 0x7fffffff
; SI-NEXT:    v_mov_b32_e32 v0, s12
; SI-NEXT:    v_mov_b32_e32 v2, s14
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_mov_b32_e32 v6, s10
; SI-NEXT:    v_mov_b32_e32 v1, s6
; SI-NEXT:    v_mov_b32_e32 v3, s5
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    v_mov_b32_e32 v5, s7
; SI-NEXT:    v_mov_b32_e32 v7, s4
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; SI-NEXT:    s_endpgm
  %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
  store <4 x double> %fabs, ptr addrspace(1) %out
  ret void
}

; fabs (intrinsic) feeding an fmul. The expected code has no separate
; sign-clearing instruction: the absolute value appears as the |s[8:9]|
; operand of v_mul_f64.
define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) {
; SI-LABEL: fabs_fold_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_mul_f64 v[0:1], |s[8:9]|, v[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
  %fabs = call double @llvm.fabs.f64(double %in0)
  %fmul = fmul double %fabs, %in1
  store double %fmul, ptr addrspace(1) %out
  ret void
}

; Same as fabs_fold_f64, but the absolute value comes from a call to the
; declared @fabs function instead of the intrinsic. The expected assembly is
; identical to the intrinsic variant.
define amdgpu_kernel void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) {
; SI-LABEL: fabs_fn_fold_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s7
; SI-NEXT:    v_mul_f64 v[0:1], |s[8:9]|, v[0:1]
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
  %fabs = call double @fabs(double %in0)
  %fmul = fmul double %fabs, %in1
  store double %fmul, ptr addrspace(1) %out
  ret void
}

; fabs (intrinsic) of a double obtained by bitcasting an i64 kernel argument.
; The expected assembly is the same single scalar bitset0 sequence as in
; fabs_f64.
define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: fabs_free_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bitset0_b32 s3, 31
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
  %bc = bitcast i64 %in to double
  %fabs = call double @llvm.fabs.f64(double %bc)
  store double %fabs, ptr addrspace(1) %out
  ret void
}

; Same as fabs_free_f64, but using a call to the declared @fabs function
; instead of the intrinsic. The expected assembly is identical.
define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: fabs_fn_free_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bitset0_b32 s3, 31
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
  %bc = bitcast i64 %in to double
  %fabs = call double @fabs(double %bc)
  store double %fabs, ptr addrspace(1) %out
  ret void
}