; RUN: llc -mtriple=amdgcn -mcpu=hawaii -start-before=amdgpu-unify-divergent-exit-nodes -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI %s

; RUN: llc -mtriple=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -start-before=amdgpu-unify-divergent-exit-nodes < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI %s

; --------------------------------------------------------------------------------
; fadd tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}
; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %add, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]

; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %add = fadd float %a, %b
  %fneg = fneg float %add
  %use1 = fmul float %add, 4.0
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32
; GCN-SAFE: v_xor_b32_e32 [[ADD:v[0-9]+]], 0x80000000,

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.b = fneg float %b
  %add = fadd float %a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %add = fadd float %fneg.a, %fneg.b
  %fneg = fneg float %add
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %fneg.a, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]

; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_ADD]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %add = fadd float %fneg.a, %b
  %fneg = fneg float %add
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; This one asserted with -enable-no-signed-zeros-fp-math
; GCN-LABEL: {{^}}fneg_fadd_0:
; GCN-SAFE-DAG: v_mad_f32 [[A:v[0-9]+]],
; GCN-SAFE-DAG: v_cmp_ngt_f32_e32 {{.*}}, [[A]]
; GCN-SAFE-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, -[[A]]

; GCN-NSZ-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0, v
; GCN-NSZ: v_cmp_ngt_f32
; GCN-NSZ: v_cndmask_b32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 {
.entry:
  %tmp7 = fdiv float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; This is a workaround because -enable-no-signed-zeros-fp-math does not set up
; function attribute unsafe-fp-math automatically. Combine with the previous test
; when that is done.
; GCN-LABEL: {{^}}fneg_fadd_0_nsz:
; GCN-NSZ-DAG: v_rcp_f32_e32 [[A:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[B:v[0-9]+]],
; GCN-NSZ-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fc00000
; GCN-NSZ-DAG: v_mul_f32_e32 [[D:v[0-9]+]], 0, [[A]]
; GCN-NSZ-DAG: v_cmp_ngt_f32_e32 {{.*}}, s{{[0-9]+}}, [[D]]
; GCN-NSZ-DAG: v_cndmask_b32_e64 [[E:v[0-9]+]], -[[D]], v{{[0-9]+}},
; GCN-NSZ-DAG: v_cmp_nlt_f32_e32 {{.*}}, 0
; GCN-NSZ-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, [[C]], 0,
define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 {
.entry:
  %tmp7 = fdiv afn float 1.000000e+00, %tmp6
  %tmp8 = fmul float 0.000000e+00, %tmp7
  %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8
  %.i188 = fadd float %tmp9, 0.000000e+00
  %tmp10 = fcmp uge float %.i188, %tmp2
  %tmp11 = fneg float %.i188
  %.i092 = select i1 %tmp10, float %tmp2, float %tmp11
  %tmp12 = fcmp ule float %.i092, 0.000000e+00
  %.i198 = select i1 %tmp12, float 0.000000e+00, float 0x7FF8000000000000
  ret float %.i198
}

; --------------------------------------------------------------------------------
; fmul tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %mul, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = fmul float %a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %mul, 4.0
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.b = fneg float %b
  %mul = fmul float %a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = fmul float %fneg.a, %fneg.b
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]

; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %fneg.a, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = fmul float %fneg.a, %b
  %fneg = fneg float %mul
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; --------------------------------------------------------------------------------
; fminnum tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_max_f32_e64 v0, -v0, -v1
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_minnum_f32_no_ieee(float %a, float %b) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %b)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_self_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  store float %min.fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_self_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -v0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_self_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float %a, float %a)
  %min.fneg = fneg float %min
  ret float %min.fneg
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_posk_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, -4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_posk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float 4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_negk_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32_no_ieee:
; GCN-NOT: v0
; GCN: v_max_f32_e64 v0, -v0, 4.0
; GCN-NEXT: ; return
define amdgpu_ps float @v_fneg_negk_minnum_f32_no_ieee(float %a) #0 {
  %min = call float @llvm.minnum.f32(float -4.0, float %a)
  %fneg = fneg float %min
  ret float %fneg
}

; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MIN]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_0_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call nnan float @llvm.minnum.f32(float 0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_ieee:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]]
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg0_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call float @llvm.minnum.f32(float -0.0, float %a)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI-DAG: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xbe22f983, [[QUIET_NEG]]

; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; VI: v_min_f32_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f32:
; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]

; SI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; SI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0x3e22f983, [[NEG_QUIET]]

; VI: v_mul_f32_e32 [[NEG_QUIET:v[0-9]+]], -1.0, [[A]]
; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %min = call float @llvm.minnum.f32(float 0xBFC45F3060000000, float %a)
  %fneg = fneg float %min
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0xbe230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e32 [[QUIET:v[0-9]+]], [[A]], [[A]]
; VI: v_min_f16_e32 [[MAX:v[0-9]+]], 0.15915494, [[QUIET]]
; VI: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x8000, [[MAX]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile half, ptr addrspace(1) %a.gep
  %min = call half @llvm.minnum.f16(half 0xH3118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f16:
; GCN-DAG: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]

; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
; SI: v_max_f32_e32 [[MAX:v[0-9]+]], 0x3e230000, [[CVT]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[MAX]]

; VI: v_max_f16_e64 [[NEG_QUIET:v[0-9]+]], -[[A]], -[[A]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], 0.15915494, [[NEG_QUIET]]

; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile half, ptr addrspace(1) %a.gep
  %min = call half @llvm.minnum.f16(half 0xHB118, half %a)
  %fneg = fsub half -0.000000e+00, %min
  store half %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_min_f64 v[[[RESULT_LO:[0-9]+]]:[[RESULT_HI:[0-9]+]]], [[A]], 0.15915494
; VI: v_xor_b32_e32 v[[RESULT_HI]], 0x80000000, v[[RESULT_HI]]

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[RESULT_LO]]:[[RESULT_HI]]]
define amdgpu_kernel void @v_fneg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %min = call double @llvm.minnum.f64(double 0x3fc45f306dc9c882, double %a)
  %fneg = fsub double -0.000000e+00, %min
  store double %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_neg_inv2pi_minnum_f64:
; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]

; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fc45f30
; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x6dc9c882
; SI-DAG: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], s[[[K_LO]]:[[K_HI]]]

; VI: v_max_f64 [[NEG_QUIET:v\[[0-9]+:[0-9]+\]]], -[[A]], -[[A]]
; VI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[NEG_QUIET]], 0.15915494

; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_neg_inv2pi_minnum_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double,
ptr addrspace(1) %a.gep 739 %min = call double @llvm.minnum.f64(double 0xbfc45f306dc9c882, double %a) 740 %fneg = fsub double -0.000000e+00, %min 741 store double %fneg, ptr addrspace(1) %out.gep 742 ret void 743} 744 745; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32_no_ieee: 746; GCN-NOT: v0 747; GCN: v_max_f32_e64 v0, -v0, 0{{$}} 748; GCN-NEXT: ; return 749define amdgpu_ps float @v_fneg_neg0_minnum_f32_no_ieee(float %a) #0 { 750 %min = call float @llvm.minnum.f32(float -0.0, float %a) 751 %fneg = fneg float %min 752 ret float %fneg 753} 754 755; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_ieee: 756; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 757; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 758; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] 759; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[QUIET_A]] 760; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] 761; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 762define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 763 %tid = call i32 @llvm.amdgcn.workitem.id.x() 764 %tid.ext = sext i32 %tid to i64 765 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 766 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 767 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 768 %a = load volatile float, ptr addrspace(1) %a.gep 769 %b = load volatile float, ptr addrspace(1) %b.gep 770 %min = call float @llvm.minnum.f32(float 0.0, float %a) 771 %fneg = fneg float %min 772 %mul = fmul float %fneg, %b 773 store float %mul, ptr addrspace(1) %out.gep 774 ret void 775} 776 777; GCN-LABEL: {{^}}v_fneg_inv2pi_minnum_foldable_use_f32: 778; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 779; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 780 781; SI: v_mul_f32_e32 [[QUIET_NEG:v[0-9]+]], -1.0, [[A]] 782 783; SI: v_max_f32_e32 [[MIN:v[0-9]+]], 0xbe22f983, 
[[QUIET_NEG]] 784; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[MIN]], [[B]] 785 786; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]] 787; VI: v_min_f32_e32 [[MIN:v[0-9]+]], 0.15915494, [[QUIET]] 788; VI: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] 789 790; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 791define amdgpu_kernel void @v_fneg_inv2pi_minnum_foldable_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 792 %tid = call i32 @llvm.amdgcn.workitem.id.x() 793 %tid.ext = sext i32 %tid to i64 794 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 795 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 796 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 797 %a = load volatile float, ptr addrspace(1) %a.gep 798 %b = load volatile float, ptr addrspace(1) %b.gep 799 %min = call float @llvm.minnum.f32(float 0x3FC45F3060000000, float %a) 800 %fneg = fneg float %min 801 %mul = fmul float %fneg, %b 802 store float %mul, ptr addrspace(1) %out.gep 803 ret void 804} 805 806; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32_no_ieee: 807; GCN-NOT: v0 808; GCN-NOT: v1 809; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, v0 810; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], v1 811; GCN-NEXT: ; return 812define amdgpu_ps float @v_fneg_0_minnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { 813 %min = call float @llvm.minnum.f32(float 0.0, float %a) 814 %fneg = fneg float %min 815 %mul = fmul float %fneg, %b 816 ret float %mul 817} 818 819; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_ieee: 820; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 821; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 822; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 823; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] 824; GCN: v_max_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] 825; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] 
826; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] 827; GCN-NEXT: s_waitcnt vmcnt(0) 828; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 829; GCN-NEXT: s_waitcnt vmcnt(0) 830define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 831 %tid = call i32 @llvm.amdgcn.workitem.id.x() 832 %tid.ext = sext i32 %tid to i64 833 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 834 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 835 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 836 %a = load volatile float, ptr addrspace(1) %a.gep 837 %b = load volatile float, ptr addrspace(1) %b.gep 838 %min = call float @llvm.minnum.f32(float %a, float %b) 839 %fneg = fneg float %min 840 %use1 = fmul float %min, 4.0 841 store volatile float %fneg, ptr addrspace(1) %out 842 store volatile float %use1, ptr addrspace(1) %out 843 ret void 844} 845 846; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32_no_ieee: 847; GCN-NOT: v0 848; GCN-NOT: v1 849; GCN: v_max_f32_e64 v0, -v0, -v1 850; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 851; GCN-NEXT: ; return 852define amdgpu_ps <2 x float> @v_fneg_minnum_multi_use_minnum_f32_no_ieee(float %a, float %b) #0 { 853 %min = call float @llvm.minnum.f32(float %a, float %b) 854 %fneg = fneg float %min 855 %use1 = fmul float %min, 4.0 856 %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 857 %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 858 ret <2 x float> %ins1 859} 860 861; -------------------------------------------------------------------------------- 862; fmaxnum tests 863; -------------------------------------------------------------------------------- 864 865 866; GCN-LABEL: {{^}}v_fneg_maxnum_f32_ieee: 867; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 868; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 869; GCN-DAG: v_mul_f32_e32 
[[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 870; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] 871; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] 872; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 873define amdgpu_kernel void @v_fneg_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 874 %tid = call i32 @llvm.amdgcn.workitem.id.x() 875 %tid.ext = sext i32 %tid to i64 876 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 877 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 878 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 879 %a = load volatile float, ptr addrspace(1) %a.gep 880 %b = load volatile float, ptr addrspace(1) %b.gep 881 %max = call float @llvm.maxnum.f32(float %a, float %b) 882 %fneg = fneg float %max 883 store float %fneg, ptr addrspace(1) %out.gep 884 ret void 885} 886 887; GCN-LABEL: {{^}}v_fneg_maxnum_f32_no_ieee: 888; GCN-NOT: v0 889; GCN-NOT: v1 890; GCN: v_min_f32_e64 v0, -v0, -v1 891; GCN-NEXT: ; return 892define amdgpu_ps float @v_fneg_maxnum_f32_no_ieee(float %a, float %b) #0 { 893 %max = call float @llvm.maxnum.f32(float %a, float %b) 894 %fneg = fneg float %max 895 ret float %fneg 896} 897 898; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_ieee: 899; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 900; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 901; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_A]] 902; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 903define amdgpu_kernel void @v_fneg_self_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 904 %tid = call i32 @llvm.amdgcn.workitem.id.x() 905 %tid.ext = sext i32 %tid to i64 906 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 907 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 908 %a = load volatile float, ptr 
addrspace(1) %a.gep 909 %max = call float @llvm.maxnum.f32(float %a, float %a) 910 %max.fneg = fneg float %max 911 store float %max.fneg, ptr addrspace(1) %out.gep 912 ret void 913} 914 915; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32_no_ieee: 916; GCN-NOT: v0 917; GCN: v_min_f32_e64 v0, -v0, -v0 918; GCN-NEXT: ; return 919define amdgpu_ps float @v_fneg_self_maxnum_f32_no_ieee(float %a) #0 { 920 %max = call float @llvm.maxnum.f32(float %a, float %a) 921 %max.fneg = fneg float %max 922 ret float %max.fneg 923} 924 925; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_ieee: 926; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 927; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] 928; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], -4.0, [[QUIET_NEG_A]] 929; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 930define amdgpu_kernel void @v_fneg_posk_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 931 %tid = call i32 @llvm.amdgcn.workitem.id.x() 932 %tid.ext = sext i32 %tid to i64 933 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 934 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 935 %a = load volatile float, ptr addrspace(1) %a.gep 936 %max = call float @llvm.maxnum.f32(float 4.0, float %a) 937 %fneg = fneg float %max 938 store float %fneg, ptr addrspace(1) %out.gep 939 ret void 940} 941 942; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32_no_ieee: 943; GCN-NOT: v0 944; GCN: v_min_f32_e64 v0, -v0, -4.0 945; GCN-NEXT: ; return 946define amdgpu_ps float @v_fneg_posk_maxnum_f32_no_ieee(float %a) #0 { 947 %max = call float @llvm.maxnum.f32(float 4.0, float %a) 948 %fneg = fneg float %max 949 ret float %fneg 950} 951 952; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_ieee: 953; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 954; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] 955; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 4.0, [[QUIET_NEG_A]] 956; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 957define 
amdgpu_kernel void @v_fneg_negk_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 958 %tid = call i32 @llvm.amdgcn.workitem.id.x() 959 %tid.ext = sext i32 %tid to i64 960 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 961 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 962 %a = load volatile float, ptr addrspace(1) %a.gep 963 %max = call float @llvm.maxnum.f32(float -4.0, float %a) 964 %fneg = fneg float %max 965 store float %fneg, ptr addrspace(1) %out.gep 966 ret void 967} 968 969; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32_no_ieee: 970; GCN-NOT: v0 971; GCN: v_min_f32_e64 v0, -v0, 4.0 972; GCN-NEXT: ; return 973define amdgpu_ps float @v_fneg_negk_maxnum_f32_no_ieee(float %a) #0 { 974 %max = call float @llvm.maxnum.f32(float -4.0, float %a) 975 %fneg = fneg float %max 976 ret float %fneg 977} 978 979; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32: 980; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 981; GCN-NOT: [[A]] 982; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] 983; GCN: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[MAX]] 984; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 985define amdgpu_kernel void @v_fneg_0_maxnum_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 986 %tid = call i32 @llvm.amdgcn.workitem.id.x() 987 %tid.ext = sext i32 %tid to i64 988 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 989 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 990 %a = load volatile float, ptr addrspace(1) %a.gep 991 %max = call nnan float @llvm.maxnum.f32(float 0.0, float %a) 992 %fneg = fneg float %max 993 store float %fneg, ptr addrspace(1) %out.gep 994 ret void 995} 996 997; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_ieee: 998; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 999; GCN: v_mul_f32_e32 [[QUIET_NEG_A:v[0-9]+]], -1.0, [[A]] 1000; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_NEG_A]] 1001; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1002define amdgpu_kernel void @v_fneg_neg0_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 1003 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1004 %tid.ext = sext i32 %tid to i64 1005 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1006 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1007 %a = load volatile float, ptr addrspace(1) %a.gep 1008 %max = call float @llvm.maxnum.f32(float -0.0, float %a) 1009 %fneg = fneg float %max 1010 store float %fneg, ptr addrspace(1) %out.gep 1011 ret void 1012} 1013 1014; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32_no_ieee: 1015; GCN-NOT: v0 1016; GCN: v_min_f32_e64 v0, -v0, 0{{$}} 1017; GCN-NEXT: ; return 1018define amdgpu_ps float @v_fneg_neg0_maxnum_f32_no_ieee(float %a) #0 { 1019 %max = call float @llvm.maxnum.f32(float -0.0, float %a) 1020 %fneg = fneg float %max 1021 ret float %fneg 1022} 1023 1024; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_ieee: 1025; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1026; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1027; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]] 1028; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]] 1029; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]] 1030; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1031define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 1032 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1033 %tid.ext = sext i32 %tid to i64 1034 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1035 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1036 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1037 %a = load volatile float, ptr addrspace(1) %a.gep 1038 %b = load volatile float, ptr addrspace(1) %b.gep 1039 %max = call float @llvm.maxnum.f32(float 0.0, 
float %a) 1040 %fneg = fneg float %max 1041 %mul = fmul float %fneg, %b 1042 store float %mul, ptr addrspace(1) %out.gep 1043 ret void 1044} 1045 1046; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32_no_ieee: 1047; GCN-NOT: v0 1048; GCN-NOT: v1 1049; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, v0 1050; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], v1 1051; GCN-NEXT: ; return 1052define amdgpu_ps float @v_fneg_0_maxnum_foldable_use_f32_no_ieee(float %a, float %b) #0 { 1053 %max = call float @llvm.maxnum.f32(float 0.0, float %a) 1054 %fneg = fneg float %max 1055 %mul = fmul float %fneg, %b 1056 ret float %mul 1057} 1058 1059; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_ieee: 1060; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1061; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1062; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_A:v[0-9]+]], -1.0, [[A]] 1063; GCN-DAG: v_mul_f32_e32 [[NEG_QUIET_B:v[0-9]+]], -1.0, [[B]] 1064; GCN: v_min_f32_e32 [[MAX0:v[0-9]+]], [[NEG_QUIET_A]], [[NEG_QUIET_B]] 1065; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]] 1066; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX0]] 1067; GCN-NEXT: s_waitcnt vmcnt(0) 1068; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 1069; GCN-NEXT: s_waitcnt vmcnt(0) 1070define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32_ieee(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 1071 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1072 %tid.ext = sext i32 %tid to i64 1073 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1074 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1075 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1076 %a = load volatile float, ptr addrspace(1) %a.gep 1077 %b = load volatile float, ptr addrspace(1) %b.gep 1078 %max = call float @llvm.maxnum.f32(float %a, float %b) 1079 %fneg = fneg float %max 1080 %use1 = fmul float %max, 4.0 1081 store volatile 
float %fneg, ptr addrspace(1) %out 1082 store volatile float %use1, ptr addrspace(1) %out 1083 ret void 1084} 1085 1086; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32_no_ieee: 1087; GCN-NOT: v0 1088; GCN-NOT: v1 1089; GCN: v_min_f32_e64 v0, -v0, -v1 1090; GCN-NEXT: v_mul_f32_e32 v1, -4.0, v0 1091; GCN-NEXT: ; return 1092define amdgpu_ps <2 x float> @v_fneg_maxnum_multi_use_maxnum_f32_no_ieee(float %a, float %b) #0 { 1093 %max = call float @llvm.maxnum.f32(float %a, float %b) 1094 %fneg = fneg float %max 1095 %use1 = fmul float %max, 4.0 1096 %ins0 = insertelement <2 x float> undef, float %fneg, i32 0 1097 %ins1 = insertelement <2 x float> %ins0, float %use1, i32 1 1098 ret <2 x float> %ins1 1099} 1100 1101; -------------------------------------------------------------------------------- 1102; fma tests 1103; -------------------------------------------------------------------------------- 1104 1105; GCN-LABEL: {{^}}v_fneg_fma_f32: 1106; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1107; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1108; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1109 1110; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] 1111; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]] 1112 1113; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]] 1114; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 1115define amdgpu_kernel void @v_fneg_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1116 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1117 %tid.ext = sext i32 %tid to i64 1118 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1119 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1120 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1121 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1122 %a = load volatile float, ptr addrspace(1) 
%a.gep 1123 %b = load volatile float, ptr addrspace(1) %b.gep 1124 %c = load volatile float, ptr addrspace(1) %c.gep 1125 %fma = call float @llvm.fma.f32(float %a, float %b, float %c) 1126 %fneg = fneg float %fma 1127 store float %fneg, ptr addrspace(1) %out.gep 1128 ret void 1129} 1130 1131; GCN-LABEL: {{^}}v_fneg_fma_store_use_fma_f32: 1132; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1133; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1134; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1135; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1136; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] 1137; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] 1138; GCN-NEXT: s_waitcnt vmcnt(0) 1139; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1140; GCN-NEXT: s_waitcnt vmcnt(0) 1141define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1142 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1143 %tid.ext = sext i32 %tid to i64 1144 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1145 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1146 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1147 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1148 %a = load volatile float, ptr addrspace(1) %a.gep 1149 %b = load volatile float, ptr addrspace(1) %b.gep 1150 %c = load volatile float, ptr addrspace(1) %c.gep 1151 %fma = call float @llvm.fma.f32(float %a, float %b, float %c) 1152 %fneg = fneg float %fma 1153 store volatile float %fneg, ptr addrspace(1) %out 1154 store volatile float %fma, ptr addrspace(1) %out 1155 ret void 1156} 1157 1158; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fma_f32: 1159; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1160; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1161; GCN: {{buffer|flat}}_load_dword 
[[C:v[0-9]+]] 1162 1163; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1164; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]] 1165; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]] 1166 1167; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] 1168; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]] 1169 1170; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]] 1171; GCN-NEXT: s_waitcnt vmcnt(0) 1172; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 1173; GCN-NEXT: s_waitcnt vmcnt(0) 1174define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1175 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1176 %tid.ext = sext i32 %tid to i64 1177 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1178 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1179 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1180 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1181 %a = load volatile float, ptr addrspace(1) %a.gep 1182 %b = load volatile float, ptr addrspace(1) %b.gep 1183 %c = load volatile float, ptr addrspace(1) %c.gep 1184 %fma = call float @llvm.fma.f32(float %a, float %b, float %c) 1185 %fneg = fneg float %fma 1186 %use1 = fmul float %fma, 4.0 1187 store volatile float %fneg, ptr addrspace(1) %out 1188 store volatile float %use1, ptr addrspace(1) %out 1189 ret void 1190} 1191 1192; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_y_f32: 1193; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1194; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1195; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1196 1197; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]] 1198; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1199 1200; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1201; GCN-NSZ-NOT: 
[[FMA]] 1202; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1203define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1204 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1205 %tid.ext = sext i32 %tid to i64 1206 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1207 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1208 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1209 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1210 %a = load volatile float, ptr addrspace(1) %a.gep 1211 %b = load volatile float, ptr addrspace(1) %b.gep 1212 %c = load volatile float, ptr addrspace(1) %c.gep 1213 %fneg.a = fneg float %a 1214 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 1215 %fneg = fneg float %fma 1216 store volatile float %fneg, ptr addrspace(1) %out 1217 ret void 1218} 1219 1220; GCN-LABEL: {{^}}v_fneg_fma_x_fneg_y_f32: 1221; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1222; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1223; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1224 1225; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] 1226; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1227 1228; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1229; GCN-NSZ-NOT: [[FMA]] 1230; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1231define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1232 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1233 %tid.ext = sext i32 %tid to i64 1234 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1235 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1236 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 
%tid.ext 1237 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1238 %a = load volatile float, ptr addrspace(1) %a.gep 1239 %b = load volatile float, ptr addrspace(1) %b.gep 1240 %c = load volatile float, ptr addrspace(1) %c.gep 1241 %fneg.b = fneg float %b 1242 %fma = call float @llvm.fma.f32(float %a, float %fneg.b, float %c) 1243 %fneg = fneg float %fma 1244 store volatile float %fneg, ptr addrspace(1) %out 1245 ret void 1246} 1247 1248; GCN-LABEL: {{^}}v_fneg_fma_fneg_fneg_y_f32: 1249; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1250; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1251; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1252 1253; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1254; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1255 1256; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] 1257; GCN-NSZ-NOT: [[FMA]] 1258; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1259define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1260 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1261 %tid.ext = sext i32 %tid to i64 1262 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1263 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1264 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1265 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1266 %a = load volatile float, ptr addrspace(1) %a.gep 1267 %b = load volatile float, ptr addrspace(1) %b.gep 1268 %c = load volatile float, ptr addrspace(1) %c.gep 1269 %fneg.a = fneg float %a 1270 %fneg.b = fneg float %b 1271 %fma = call float @llvm.fma.f32(float %fneg.a, float %fneg.b, float %c) 1272 %fneg = fneg float %fma 1273 store volatile float %fneg, ptr addrspace(1) %out 1274 ret void 1275} 1276 1277; GCN-LABEL: {{^}}v_fneg_fma_fneg_x_fneg_f32: 
1278; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1279; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1280; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1281 1282; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]] 1283; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1284 1285; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] 1286; GCN-NSZ-NOT: [[FMA]] 1287; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1288define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 1289 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1290 %tid.ext = sext i32 %tid to i64 1291 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 1292 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 1293 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 1294 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 1295 %a = load volatile float, ptr addrspace(1) %a.gep 1296 %b = load volatile float, ptr addrspace(1) %b.gep 1297 %c = load volatile float, ptr addrspace(1) %c.gep 1298 %fneg.a = fneg float %a 1299 %fneg.c = fneg float %c 1300 %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %fneg.c) 1301 %fneg = fneg float %fma 1302 store volatile float %fneg, ptr addrspace(1) %out 1303 ret void 1304} 1305 1306; GCN-LABEL: {{^}}v_fneg_fma_x_y_fneg_f32: 1307; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 1308; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 1309; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 1310 1311; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]] 1312; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]] 1313 1314; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]] 1315; GCN-NSZ-NOT: [[FMA]] 1316; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]] 1317define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(ptr 
addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fneg.c = fneg float %c
  %fma = call float @llvm.fma.f32(float %a, float %b, float %fneg.c)
  %fneg = fneg float %fma
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; fneg of fma is folded, but the fneg'd input %a has a separate store use,
; so a standalone xor must be kept alive for it.
; GCN-LABEL: {{^}}v_fneg_fma_store_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_xor_b32
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
; GCN-SAFE: v_xor_b32

; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]

; GCN-NSZ-NOT: [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA]]
; GCN-NSZ-NOT: [[NEG_A]]
; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %fneg.a, ptr addrspace(1) %out
  ret void
}

; As above, but the extra use of -%a is a foldable fmul, so the negation
; can be absorbed into the multiply's source modifier instead.
; GCN-LABEL: {{^}}v_fneg_fma_multi_use_fneg_x_y_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]

; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NSZ-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, float %d) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fneg.a = fneg float %a
  %fma = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fneg.a, %d
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; --------------------------------------------------------------------------------
; fmad tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]

; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fmad_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_v4f32:

; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
; GCN-NSZ: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @v_fneg_fmad_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile <4 x float>, ptr addrspace(1) %a.gep
  %b = load volatile <4 x float>, ptr addrspace(1) %b.gep
  %c = load volatile <4 x float>, ptr addrspace(1) %c.gep
  %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
  %fneg = fneg <4 x float> %fma
  store <4 x float> %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fmad_multi_use_fmad_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]

; GCN-SAFE: v_mac_f32_e32 [[C]], [[A]], [[B]]
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]

; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %c = load volatile float, ptr addrspace(1) %c.gep
  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
  %fneg = fneg float %fma
  %use1 = fmul float %fma, 4.0
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; --------------------------------------------------------------------------------
; fp_extend tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_fp_extend_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, ptr addrspace(1) %out.gep
  ret void
}

; fneg(fpext(fneg(x))) - the two negations cancel entirely.
; GCN-LABEL: {{^}}v_fneg_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store double %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_extend_store_use_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FNEG_A]]
define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpext = fpext float %fneg.a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, ptr addrspace(1) %out.gep
  store volatile float %fneg.a, ptr addrspace(1) undef
  ret void
}

; The f64 fneg only needs to flip the sign of the high half, so only the
; high 32-bit register of the conversion result is xor'd.
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[CVT_LO]]:[[CVT_HI]]]
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  store volatile double %fneg, ptr addrspace(1) %out.gep
  store volatile double %fpext, ptr addrspace(1) undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f64_f32_e32 v[[[CVT_LO:[0-9]+]]:[[CVT_HI:[0-9]+]]], [[A]]
; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v[[[CVT_LO]]:[[CVT_HI]]], 4.0
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[FNEG_A]]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpext = fpext float %a to double
  %fneg = fsub double -0.000000e+00, %fpext
  %mul = fmul double %fpext, 4.0
  store volatile double %fneg, ptr addrspace(1) %out.gep
  store volatile double %mul, ptr addrspace(1) %out.gep
  ret void
}

; FIXME: Source modifiers not folded for f16->f32
; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile half, ptr addrspace(1) %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %fpext, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds half, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile half, ptr addrspace(1) %a.gep
  %fpext = fpext half %a to float
  %fneg = fneg float %fpext
  %mul = fmul float %fpext, 4.0
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %mul, ptr addrspace(1) %out.gep
  ret void
}

; --------------------------------------------------------------------------------
; fp_round tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_fp_round_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 v[[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v[[[A_LO]]:[[A_HI]]]
; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[[[A_LO]]:[[NEG_A_HI]]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile double %fneg.a, ptr addrspace(1) undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s[

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, double %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fneg.a = fsub double -0.000000e+00, %a
  %fpround = fptrunc double %fneg.a to float
  %fneg = fneg float %fpround
  %use1 = fmul double %fneg.a, %c
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile double %use1, ptr addrspace(1) undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fpround = fptrunc float %a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store half %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_multi_use_fp_round_fneg_f64_to_f32:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_cvt_f32_f64_e32 [[CVT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[CVT]]
define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile double, ptr addrspace(1) %a.gep
  %fpround = fptrunc double %a to float
  %fneg = fneg float %fpround
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %fpround, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_store_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  store volatile half %fneg, ptr addrspace(1) %out.gep
  store volatile float %fneg.a, ptr addrspace(1) undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_fp_round_multi_use_fneg_f32_to_f16:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[USE1]]
define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %fpround = fptrunc float %fneg.a to half
  %fneg = fsub half -0.000000e+00, %fpround
  %use1 = fmul float %fneg.a, %c
  store volatile half %fneg, ptr addrspace(1) %out.gep
  store volatile float %use1, ptr addrspace(1) undef
  ret void
}

; --------------------------------------------------------------------------------
; rcp tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_rcp_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %rcp = call float @llvm.amdgcn.rcp.f32(float %a)
  %fneg = fneg float %rcp
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_rcp_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_store_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %fneg.a, ptr addrspace(1) undef
  ret void
}

; GCN-LABEL: {{^}}v_fneg_rcp_multi_use_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %fneg.a = fneg float %a
  %rcp = call float @llvm.amdgcn.rcp.f32(float %fneg.a)
  %fneg = fneg float %rcp
  %use1 = fmul float %fneg.a, %c
  store volatile float %fneg, ptr addrspace(1) %out.gep
  store volatile float %use1, ptr addrspace(1) undef
  ret void
}

; --------------------------------------------------------------------------------
; fmul_legacy tests
; --------------------------------------------------------------------------------

; GCN-LABEL: {{^}}v_fneg_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %mul, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
; GCN-NEXT: s_waitcnt vmcnt(0)
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %mul, float 4.0)
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_x_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_fneg_fneg_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]]
define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %fneg.b = fneg float %b
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %fneg.b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_store_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_A]]
define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %fneg.a, ptr addrspace(1) %out
  ret void
}

; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_fneg_x_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[A]], [[B]]
; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MUL_LEGACY]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, float %c) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fneg.a = fneg float %a
  %mul = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %b)
  %fneg = fneg float %mul
  %use1 = call float @llvm.amdgcn.fmul.legacy(float %fneg.a, float %c)
  store volatile float %fneg, ptr addrspace(1) %out
  store volatile float %use1, ptr addrspace(1) %out
  ret void
}

; --------------------------------------------------------------------------------
; sin tests
; --------------------------------------------------------------------------------

; llvm.sin expands to a 1/(2*pi) pre-scale plus fract before v_sin; the
; fneg cannot fold through the expansion, so no source modifier appears.
; GCN-LABEL: {{^}}v_fneg_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbe22f983, [[A]]
; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fneg_sin_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %sin = call float @llvm.sin.f32(float %a)
  %fneg = fneg float %sin
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; GCN-LABEL: {{^}}v_fneg_amdgcn_sin_f32:
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, 
[[RESULT]] 2071define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 2072 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2073 %tid.ext = sext i32 %tid to i64 2074 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2075 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2076 %a = load volatile float, ptr addrspace(1) %a.gep 2077 %sin = call float @llvm.amdgcn.sin.f32(float %a) 2078 %fneg = fneg float %sin 2079 store float %fneg, ptr addrspace(1) %out.gep 2080 ret void 2081} 2082 2083; -------------------------------------------------------------------------------- 2084; ftrunc tests 2085; -------------------------------------------------------------------------------- 2086 2087; GCN-LABEL: {{^}}v_fneg_trunc_f32: 2088; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2089; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2090; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2091define amdgpu_kernel void @v_fneg_trunc_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 2092 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2093 %tid.ext = sext i32 %tid to i64 2094 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2095 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2096 %a = load volatile float, ptr addrspace(1) %a.gep 2097 %trunc = call float @llvm.trunc.f32(float %a) 2098 %fneg = fneg float %trunc 2099 store float %fneg, ptr addrspace(1) %out.gep 2100 ret void 2101} 2102 2103; -------------------------------------------------------------------------------- 2104; fround tests 2105; -------------------------------------------------------------------------------- 2106 2107; GCN-LABEL: {{^}}v_fneg_round_f32: 2108; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2109; GCN: v_trunc_f32_e32 2110; GCN: v_sub_f32_e32 2111; GCN: v_cndmask_b32 2112 2113; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} 
2114; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]] 2115 2116; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}} 2117; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2118define amdgpu_kernel void @v_fneg_round_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 2119 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2120 %tid.ext = sext i32 %tid to i64 2121 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2122 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2123 %a = load volatile float, ptr addrspace(1) %a.gep 2124 %round = call float @llvm.round.f32(float %a) 2125 %fneg = fneg float %round 2126 store float %fneg, ptr addrspace(1) %out.gep 2127 ret void 2128} 2129 2130; -------------------------------------------------------------------------------- 2131; rint tests 2132; -------------------------------------------------------------------------------- 2133 2134; GCN-LABEL: {{^}}v_fneg_rint_f32: 2135; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2136; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2137; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2138define amdgpu_kernel void @v_fneg_rint_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 2139 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2140 %tid.ext = sext i32 %tid to i64 2141 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2142 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2143 %a = load volatile float, ptr addrspace(1) %a.gep 2144 %rint = call float @llvm.rint.f32(float %a) 2145 %fneg = fneg float %rint 2146 store float %fneg, ptr addrspace(1) %out.gep 2147 ret void 2148} 2149 2150; -------------------------------------------------------------------------------- 2151; nearbyint tests 2152; -------------------------------------------------------------------------------- 2153 2154; GCN-LABEL: {{^}}v_fneg_nearbyint_f32: 2155; GCN: 
{{buffer|flat}}_load_dword [[A:v[0-9]+]] 2156; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]] 2157; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2158define amdgpu_kernel void @v_fneg_nearbyint_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 2159 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2160 %tid.ext = sext i32 %tid to i64 2161 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2162 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2163 %a = load volatile float, ptr addrspace(1) %a.gep 2164 %nearbyint = call float @llvm.nearbyint.f32(float %a) 2165 %fneg = fneg float %nearbyint 2166 store float %fneg, ptr addrspace(1) %out.gep 2167 ret void 2168} 2169 2170; -------------------------------------------------------------------------------- 2171; fcanonicalize tests 2172; -------------------------------------------------------------------------------- 2173 2174; GCN-LABEL: {{^}}v_fneg_canonicalize_f32: 2175; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2176; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]] 2177; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] 2178define amdgpu_kernel void @v_fneg_canonicalize_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 { 2179 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2180 %tid.ext = sext i32 %tid to i64 2181 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2182 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2183 %a = load volatile float, ptr addrspace(1) %a.gep 2184 %trunc = call float @llvm.canonicalize.f32(float %a) 2185 %fneg = fneg float %trunc 2186 store float %fneg, ptr addrspace(1) %out.gep 2187 ret void 2188} 2189 2190; -------------------------------------------------------------------------------- 2191; vintrp tests 2192; -------------------------------------------------------------------------------- 2193 2194; GCN-LABEL: {{^}}v_fneg_interp_p1_f32: 2195; GCN: 
{{buffer|flat}}_load_dword [[A:v[0-9]+]] 2196; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2197; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2198; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2199; GCN: v_interp_p1_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2200define amdgpu_kernel void @v_fneg_interp_p1_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 2201 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2202 %tid.ext = sext i32 %tid to i64 2203 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2204 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2205 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2206 %a = load volatile float, ptr addrspace(1) %a.gep 2207 %b = load volatile float, ptr addrspace(1) %b.gep 2208 %mul = fmul float %a, %b 2209 %fneg = fneg float %mul 2210 %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0) 2211 %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0) 2212 store volatile float %intrp0, ptr addrspace(1) %out.gep 2213 store volatile float %intrp1, ptr addrspace(1) %out.gep 2214 ret void 2215} 2216 2217; GCN-LABEL: {{^}}v_fneg_interp_p2_f32: 2218; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2219; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2220; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2221; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2222; GCN: v_interp_p2_f32{{(_e32)?}} v{{[0-9]+}}, [[MUL]] 2223define amdgpu_kernel void @v_fneg_interp_p2_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { 2224 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2225 %tid.ext = sext i32 %tid to i64 2226 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2227 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2228 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 
2229 %a = load volatile float, ptr addrspace(1) %a.gep 2230 %b = load volatile float, ptr addrspace(1) %b.gep 2231 %mul = fmul float %a, %b 2232 %fneg = fneg float %mul 2233 %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0) 2234 %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0) 2235 store volatile float %intrp0, ptr addrspace(1) %out.gep 2236 store volatile float %intrp1, ptr addrspace(1) %out.gep 2237 ret void 2238} 2239 2240; -------------------------------------------------------------------------------- 2241; CopyToReg tests 2242; -------------------------------------------------------------------------------- 2243 2244; GCN-LABEL: {{^}}v_fneg_copytoreg_f32: 2245; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2246; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2247; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2248; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[A]], [[B]] 2249; GCN: s_cbranch_scc0 2250 2251; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2252; GCN: s_endpgm 2253 2254; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]] 2255; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[XOR]], [[C]] 2256; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2257 2258define amdgpu_kernel void @v_fneg_copytoreg_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 { 2259 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2260 %tid.ext = sext i32 %tid to i64 2261 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2262 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2263 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2264 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2265 %a = load volatile float, ptr addrspace(1) %a.gep 2266 %b = load volatile float, ptr addrspace(1) %b.gep 2267 %c = load volatile float, ptr 
addrspace(1) %c.gep 2268 %mul = fmul float %a, %b 2269 %fneg = fneg float %mul 2270 %cmp0 = icmp eq i32 %d, 0 2271 br i1 %cmp0, label %if, label %endif 2272 2273if: 2274 %mul1 = fmul float %fneg, %c 2275 store volatile float %mul1, ptr addrspace(1) %out.gep 2276 br label %endif 2277 2278endif: 2279 store volatile float %mul, ptr addrspace(1) %out.gep 2280 ret void 2281} 2282 2283; -------------------------------------------------------------------------------- 2284; inlineasm tests 2285; -------------------------------------------------------------------------------- 2286 2287; Can't fold into use, so should fold into source 2288; GCN-LABEL: {{^}}v_fneg_inlineasm_f32: 2289; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2290; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2291; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]] 2292; GCN: ; use [[MUL]] 2293; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2294define amdgpu_kernel void @v_fneg_inlineasm_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 { 2295 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2296 %tid.ext = sext i32 %tid to i64 2297 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2298 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2299 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2300 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2301 %a = load volatile float, ptr addrspace(1) %a.gep 2302 %b = load volatile float, ptr addrspace(1) %b.gep 2303 %c = load volatile float, ptr addrspace(1) %c.gep 2304 %mul = fmul float %a, %b 2305 %fneg = fneg float %mul 2306 call void asm sideeffect "; use $0", "v"(float %fneg) #0 2307 store volatile float %fneg, ptr addrspace(1) %out.gep 2308 ret void 2309} 2310 2311; -------------------------------------------------------------------------------- 2312; inlineasm tests 2313; 
-------------------------------------------------------------------------------- 2314 2315; Can't fold into use, so should fold into source 2316; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32: 2317; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2318; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2319; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[A]], [[B]] 2320; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]] 2321; GCN: ; use [[NEG]] 2322; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] 2323define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, i32 %d) #0 { 2324 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2325 %tid.ext = sext i32 %tid to i64 2326 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2327 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2328 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2329 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2330 %a = load volatile float, ptr addrspace(1) %a.gep 2331 %b = load volatile float, ptr addrspace(1) %b.gep 2332 %c = load volatile float, ptr addrspace(1) %c.gep 2333 %mul = fmul float %a, %b 2334 %fneg = fneg float %mul 2335 call void asm sideeffect "; use $0", "v"(float %fneg) #0 2336 store volatile float %mul, ptr addrspace(1) %out.gep 2337 ret void 2338} 2339 2340; -------------------------------------------------------------------------------- 2341; code size regression tests 2342; -------------------------------------------------------------------------------- 2343 2344; There are multiple users of the fneg that must use a VOP3 2345; instruction, so there is no penalty 2346; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32: 2347; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2348; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2349; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2350 
2351; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]] 2352; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0 2353 2354; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2355; GCN-NEXT: s_waitcnt vmcnt(0) 2356; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA1]] 2357; GCN-NEXT: s_waitcnt vmcnt(0) 2358define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 2359 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2360 %tid.ext = sext i32 %tid to i64 2361 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2362 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2363 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2364 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2365 %a = load volatile float, ptr addrspace(1) %a.gep 2366 %b = load volatile float, ptr addrspace(1) %b.gep 2367 %c = load volatile float, ptr addrspace(1) %c.gep 2368 2369 %fneg.a = fneg float %a 2370 %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c) 2371 %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0) 2372 2373 store volatile float %fma0, ptr addrspace(1) %out 2374 store volatile float %fma1, ptr addrspace(1) %out 2375 ret void 2376} 2377 2378; There are multiple users, but both require using a larger encoding 2379; for the modifier. 
2380 2381; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32: 2382; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2383; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2384; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2385 2386; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]] 2387; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] 2388; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2389; GCN-NEXT: s_waitcnt vmcnt(0) 2390; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2391; GCN-NEXT: s_waitcnt vmcnt(0) 2392define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 2393 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2394 %tid.ext = sext i32 %tid to i64 2395 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2396 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2397 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2398 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2399 %a = load volatile float, ptr addrspace(1) %a.gep 2400 %b = load volatile float, ptr addrspace(1) %b.gep 2401 %c = load volatile float, ptr addrspace(1) %c.gep 2402 2403 %fneg.a = fneg float %a 2404 %mul0 = fmul float %fneg.a, %b 2405 %mul1 = fmul float %fneg.a, %c 2406 2407 store volatile float %mul0, ptr addrspace(1) %out 2408 store volatile float %mul1, ptr addrspace(1) %out 2409 ret void 2410} 2411 2412; One user is VOP3 so has no cost to folding the modifier, the other does. 
2413; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32: 2414; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2415; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2416; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2417 2418; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0 2419; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]] 2420 2421; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2422; GCN-NEXT: s_waitcnt vmcnt(0) 2423; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2424; GCN-NEXT: s_waitcnt vmcnt(0) 2425define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr) #0 { 2426 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2427 %tid.ext = sext i32 %tid to i64 2428 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2429 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2430 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2431 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2432 %a = load volatile float, ptr addrspace(1) %a.gep 2433 %b = load volatile float, ptr addrspace(1) %b.gep 2434 %c = load volatile float, ptr addrspace(1) %c.gep 2435 2436 %fneg.a = fneg float %a 2437 %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0) 2438 %mul1 = fmul float %fneg.a, %c 2439 2440 store volatile float %fma0, ptr addrspace(1) %out 2441 store volatile float %mul1, ptr addrspace(1) %out 2442 ret void 2443} 2444 2445; The use of the fneg requires a code size increase, but folding into 2446; the source does not 2447 2448; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32: 2449; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2450; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2451; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2452; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] 2453 2454; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], 
[[B]], 2.0 2455; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]] 2456; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]] 2457 2458; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0 2459; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[FMA0]], [[C]] 2460; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[FMA0]], [[D]] 2461 2462; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2463; GCN-NEXT: s_waitcnt vmcnt(0) 2464; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL2]] 2465; GCN-NEXT: s_waitcnt vmcnt(0) 2466define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 { 2467 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2468 %tid.ext = sext i32 %tid to i64 2469 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2470 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2471 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2472 %d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext 2473 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2474 %a = load volatile float, ptr addrspace(1) %a.gep 2475 %b = load volatile float, ptr addrspace(1) %b.gep 2476 %c = load volatile float, ptr addrspace(1) %c.gep 2477 %d = load volatile float, ptr addrspace(1) %d.gep 2478 2479 %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0) 2480 %fneg.fma0 = fneg float %fma0 2481 %mul1 = fmul float %fneg.fma0, %c 2482 %mul2 = fmul float %fneg.fma0, %d 2483 2484 store volatile float %mul1, ptr addrspace(1) %out 2485 store volatile float %mul2, ptr addrspace(1) %out 2486 ret void 2487} 2488 2489; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64: 2490; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] 2491; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] 2492; GCN: 
{{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] 2493; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] 2494 2495; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 2496; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]] 2497; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] 2498 2499; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL0]] 2500; GCN-NEXT: s_waitcnt vmcnt(0) 2501; GCN-NEXT: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2502; GCN-NEXT: s_waitcnt vmcnt(0) 2503define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 { 2504 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2505 %tid.ext = sext i32 %tid to i64 2506 %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext 2507 %b.gep = getelementptr inbounds double, ptr addrspace(1) %b.ptr, i64 %tid.ext 2508 %c.gep = getelementptr inbounds double, ptr addrspace(1) %c.ptr, i64 %tid.ext 2509 %d.gep = getelementptr inbounds double, ptr addrspace(1) %d.ptr, i64 %tid.ext 2510 %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext 2511 %a = load volatile double, ptr addrspace(1) %a.gep 2512 %b = load volatile double, ptr addrspace(1) %b.gep 2513 %c = load volatile double, ptr addrspace(1) %c.gep 2514 %d = load volatile double, ptr addrspace(1) %d.gep 2515 2516 %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0) 2517 %fneg.fma0 = fsub double -0.0, %fma0 2518 %mul1 = fmul double %fneg.fma0, %c 2519 %mul2 = fmul double %fneg.fma0, %d 2520 2521 store volatile double %mul1, ptr addrspace(1) %out 2522 store volatile double %mul2, ptr addrspace(1) %out 2523 ret void 2524} 2525 2526; %trunc.a has one fneg use, but it requires a code size increase and 2527; %the fneg can instead be folded for free into the fma. 
2528 2529; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32: 2530; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2531; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2532; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2533; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] 2534; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] 2535; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2536define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 { 2537 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2538 %tid.ext = sext i32 %tid to i64 2539 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2540 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2541 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2542 %d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext 2543 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2544 %a = load volatile float, ptr addrspace(1) %a.gep 2545 %b = load volatile float, ptr addrspace(1) %b.gep 2546 %c = load volatile float, ptr addrspace(1) %c.gep 2547 %d = load volatile float, ptr addrspace(1) %d.gep 2548 2549 %trunc.a = call float @llvm.trunc.f32(float %a) 2550 %trunc.fneg.a = fneg float %trunc.a 2551 %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) 2552 store volatile float %fma0, ptr addrspace(1) %out 2553 ret void 2554} 2555 2556; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src: 2557; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 2558; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] 2559; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] 2560; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]] 2561; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]] 2562; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]] 2563; GCN-DAG: v_mul_f32_e32 
[[MUL1:v[0-9]+]], [[TRUNC_A]], [[D]] 2564; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FMA0]] 2565; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MUL1]] 2566define amdgpu_kernel void @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr, ptr addrspace(1) %c.ptr, ptr addrspace(1) %d.ptr) #0 { 2567 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2568 %tid.ext = sext i32 %tid to i64 2569 %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext 2570 %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext 2571 %c.gep = getelementptr inbounds float, ptr addrspace(1) %c.ptr, i64 %tid.ext 2572 %d.gep = getelementptr inbounds float, ptr addrspace(1) %d.ptr, i64 %tid.ext 2573 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2574 %a = load volatile float, ptr addrspace(1) %a.gep 2575 %b = load volatile float, ptr addrspace(1) %b.gep 2576 %c = load volatile float, ptr addrspace(1) %c.gep 2577 %d = load volatile float, ptr addrspace(1) %d.gep 2578 2579 %trunc.a = call float @llvm.trunc.f32(float %a) 2580 %trunc.fneg.a = fneg float %trunc.a 2581 %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c) 2582 %mul1 = fmul float %trunc.a, %d 2583 store volatile float %fma0, ptr addrspace(1) %out 2584 store volatile float %mul1, ptr addrspace(1) %out 2585 ret void 2586} 2587 2588; The AMDGPU combine to pull fneg into the FMA operands was being 2589; undone by the generic combine to pull the fneg out of the fma if 2590; !isFNegFree. We were reporting false for v2f32 even though it will 2591; be split into f32 where it will be free. 
2592; GCN-LABEL: {{^}}fneg_fma_fneg_dagcombine_loop: 2593; GCN: s_brev_b32 [[NEGZERO:s[0-9]+]], 1{{$}} 2594; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], v2, -v4, [[NEGZERO]] 2595; GCN-DAG: v_fma_f32 [[FMA1:v[0-9]+]], v3, -v5, [[NEGZERO]] 2596; GCN-DAG: v_sub_f32_e32 [[SUB0:v[0-9]+]], [[FMA0]], v0 2597; GCN-DAG: v_sub_f32_e32 [[SUB1:v[0-9]+]], [[FMA1]], v1 2598; GCN-DAG: v_mul_f32_e32 v0, [[SUB0]], v4 2599; GCN-DAG: v_mul_f32_e32 v1, [[SUB1]], v5 2600; GCN: s_setpc_b64 2601define <2 x float> @fneg_fma_fneg_dagcombine_loop(<2 x float> %arg, <2 x float> %arg1, <2 x float> %arg2) #0 { 2602bb: 2603 %i3 = call fast <2 x float> @llvm.fma.v2f32(<2 x float> %arg1, <2 x float> %arg2, <2 x float> zeroinitializer) 2604 %i4 = fadd fast <2 x float> %i3, %arg 2605 %i5 = fneg <2 x float> %i4 2606 %i6 = fmul fast <2 x float> %i5, %arg2 2607 ret <2 x float> %i6 2608} 2609 2610; This expects denormal flushing, so can't turn this fmul into fneg 2611; GCN-LABEL: {{^}}nnan_fmul_neg1_to_fneg: 2612; GCN: s_waitcnt 2613; GCN-NEXT: v_mul_f32_e64 v0, -v0, v1 2614define float @nnan_fmul_neg1_to_fneg(float %x, float %y) #0 { 2615 %mul = fmul float %x, -1.0 2616 %add = fmul nnan float %mul, %y 2617 ret float %add 2618} 2619 2620; It's legal to turn this fmul into an fneg since denormals are 2621; preserved and we know an snan can't happen from the flag. 
; GCN-LABEL: {{^}}denormal_fmul_neg1_to_fneg:
; GCN: v_mul_f32_e64 v0, -v0, v1
; GCN-NEXT: s_setpc_b64
define float @denormal_fmul_neg1_to_fneg(float %x, float %y) {
  %mul = fmul nnan float %x, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; We know the source can't be an snan: %canonical = fmul %x, %x quiets
; any snan input, so the fmul by -1.0 may still not fold directly into a
; source modifier on the user; the multiply producing the quiet value is kept.
; GCN-LABEL: {{^}}denorm_snan_fmul_neg1_to_fneg:
; GCN: s_waitcnt
; GCN-NEXT: v_mul_f32_e64 [[TMP:v[0-9]+]], v0, -v0
; GCN-NEXT: v_mul_f32_e32 v0, [[TMP]], v1
; GCN-NEXT: s_setpc_b64
define float @denorm_snan_fmul_neg1_to_fneg(float %x, float %y) {
  %canonical = fmul float %x, %x
  %mul = fmul float %canonical, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; Same as above, but with f32 denormals flushed (attribute group #0 sets
; "denormal-fp-math-f32"="preserve-sign,preserve-sign"), so canonicalize
; lowers to a multiply by 1.0 before the negated multiply.
; GCN-LABEL: {{^}}flush_snan_fmul_neg1_to_fneg:
; GCN: s_waitcnt
; GCN-NEXT: v_mul_f32_e32 [[TMP:v[0-9]+]], 1.0, v0
; GCN-NEXT: v_mul_f32_e64 v0, -[[TMP]], v1
define float @flush_snan_fmul_neg1_to_fneg(float %x, float %y) #0 {
  %quiet = call float @llvm.canonicalize.f32(float %x)
  %mul = fmul float %quiet, -1.0
  %add = fmul float %mul, %y
  ret float %add
}

; The fneg of both select operands is pulled out through the select and
; folded into the fadd user, turning it into a subtract.
; GCN-LABEL: {{^}}fadd_select_fneg_fneg_f32:
; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v3, v0
; GCN-NEXT: s_setpc_b64
define float @fadd_select_fneg_fneg_f32(i32 %arg0, float %x, float %y, float %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg float %x
  %neg.y = fneg float %y
  %select = select i1 %cmp, float %neg.x, float %neg.y
  %add = fadd float %select, %z
  ret float %add
}

; GCN-LABEL: {{^}}fadd_select_fneg_fneg_f64:
; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GCN-NEXT: v_add_f64 v[0:1], v[5:6], -v[1:2]
; GCN-NEXT: s_setpc_b64
define double @fadd_select_fneg_fneg_f64(i32 %arg0, double %x, double %y, double %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg double %x
  %neg.y = fneg double %y
  %select = select i1 %cmp, double %neg.x, double %neg.y
  %add = fadd double %select, %z
  ret double %add
}

; SI has no native f16 arithmetic, so the half operations are legalized
; through f32 conversions; VI folds the negated select into v_sub_f16.
; GCN-LABEL: {{^}}fadd_select_fneg_fneg_f16:
; SI: v_cvt_f16_f32
; SI: v_cvt_f16_f32
; SI: v_cvt_f16_f32
; SI: v_cmp_eq_u32
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; SI-NEXT: v_sub_f32_e32
; SI-NEXT: s_setpc_b64

; VI: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: s_setpc_b64
define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg half %x
  %neg.y = fneg half %y
  %select = select i1 %cmp, half %neg.x, half %neg.y
  %add = fadd half %select, %z
  ret half %add
}

; FIXME: Terrible code for SI
; GCN-LABEL: {{^}}fadd_select_fneg_fneg_v2f16:
; SI: v_cvt_f16_f32
; SI: v_cvt_f16_f32
; SI: v_cvt_f16_f32
; SI: v_cvt_f16_f32
; SI: v_cmp_eq_u32
; SI: v_lshlrev_b32_e32
; SI: v_or_b32_e32
; SI: v_cndmask_b32
; SI: v_lshrrev_b32
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_sub_f32
; SI: v_sub_f32

; VI: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_sub_f16_sdwa v1, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
define <2 x half> @fadd_select_fneg_fneg_v2f16(i32 %arg0, <2 x half> %x, <2 x half> %y, <2 x half> %z) {
  %cmp = icmp eq i32 %arg0, 0
  %neg.x = fneg <2 x half> %x
  %neg.y = fneg <2 x half> %y
  %select = select i1 %cmp, <2 x half> %neg.x, <2 x half> %neg.y
  %add = fadd <2 x half> %select, %z
  ret <2 x half> %add
}

; FIXME: This fneg should fold into select
; GCN-LABEL: {{^}}v_fneg_select_f32:
; GCN: s_waitcnt
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: s_setpc_b64
define float @v_fneg_select_f32(i32 %arg0, float %a, float %b, float %c) {
  %cond = icmp eq i32 %arg0, 0
  %select = select i1 %cond, float %a, float %b
  %fneg = fneg float %select
  ret float %fneg
}

; FIXME: This fneg should fold into select
; GCN-LABEL: {{^}}v_fneg_select_2_f32:
; GCN: s_waitcnt
; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
; GCN-NSZ-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
; GCN-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-NSZ-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
; GCN-NSZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0

; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD2:v[0-9]+]], 2.0, v1
; GCN-SAFE-NEXT: v_add_f32_e32 [[ADD4:v[0-9]+]], 4.0, v2
; GCN-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GCN-SAFE-NEXT: v_cndmask_b32_e32 v0, [[ADD4]], [[ADD2]], vcc
; GCN-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0

; GCN-NEXT: s_setpc_b64
define float @v_fneg_select_2_f32(i32 %arg0, float %a, float %b, float %c) {
  %cond = icmp eq i32 %arg0, 0
  %add.0 = fadd float %a, 2.0
  %add.1 = fadd float %b, 4.0
  %select = select i1 %cond, float %add.0, float %add.1
  %neg.select = fneg float %select
  ret float %neg.select
}

; fneg of a select with one inline-immediate constant operand; the xor with
; the sign-bit mask 0x80000000 is applied after the select.
; GCN-LABEL: {{^}}v_fneg_posk_select_f32:
; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
define amdgpu_kernel void @v_fneg_posk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %cond = icmp eq i32 %tid, 0
  %select = select i1 %cond, float 4.0, float %a
  %fneg = fneg float %select
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

; Same as above with a negative inline-immediate constant operand.
; GCN-LABEL: {{^}}v_fneg_negk_select_f32:
; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}, vcc
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
define amdgpu_kernel void @v_fneg_negk_select_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %cond = icmp eq i32 %tid, 0
  %select = select i1 %cond, float -4.0, float %a
  %fneg = fneg float %select
  store float %fneg, ptr addrspace(1) %out.gep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fma.f32(float, float, float) #1
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare float @llvm.fmuladd.f32(float, float, float) #1
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
declare float @llvm.sin.f32(float) #1
declare float @llvm.trunc.f32(float) #1
declare float @llvm.round.f32(float) #1
declare float @llvm.rint.f32(float) #1
declare float @llvm.nearbyint.f32(float) #1
declare float @llvm.canonicalize.f32(float) #1
declare float @llvm.minnum.f32(float, float) #1
declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1
declare double @llvm.minnum.f64(double, double) #1
declare double @llvm.fma.f64(double, double, double) #1

declare float @llvm.amdgcn.sin.f32(float) #1
declare float @llvm.amdgcn.rcp.f32(float) #1
declare float @llvm.amdgcn.rcp.legacy(float) #1
declare float @llvm.amdgcn.fmul.legacy(float, float) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0

; #0: f32 denormals are flushed (preserve-sign) — used by the flush_* tests.
; #1: speculatable/readnone intrinsic declarations.
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }