; Codegen test for packed half-precision (<2 x half>) multiply-add selection
; on gfx900. The four llc invocations below cross two f16 denormal modes
; (preserve-sign, i.e. flushing, and ieee) with two -fp-contract settings
; (on, fast). Each distinct configuration is listed exactly once; running an
; identical command line a second time adds test time but no coverage.
;
; NOTE: explanatory comments in this file deliberately never spell a check
; prefix or a lit keyword immediately followed by a colon, so FileCheck and
; lit cannot mistake them for directives.

; Denormals flushed (preserve-sign), contraction on / fast.
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-FLUSH %s

; IEEE denormals, contraction on (strict) / fast (contract).
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-STRICT,GFX9-DENORM %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-DENORM-CONTRACT,GFX9-DENORM %s

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1

; llvm.fmuladd on three loaded <2 x half> operands.
; Flushing configurations expect a separate packed multiply and packed add;
; IEEE-denormal configurations expect one packed FMA.
; GCN-LABEL: {{^}}fmuladd_v2f16:
; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
                                         ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
  %r0 = load <2 x half>, ptr addrspace(1) %in1
  %r1 = load <2 x half>, ptr addrspace(1) %in2
  %r2 = load <2 x half>, ptr addrspace(1) %in3
  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
  store <2 x half> %r3, ptr addrspace(1) %out
  ret void
}

; Plain fmul followed by fadd, with no fast-math flags on the instructions.
; Only the IEEE-denormal configurations carry instruction checks here:
; with -fp-contract=on the pair must stay a multiply and an add, with
; -fp-contract=fast it is expected to fuse into a packed FMA. The flushing
; configurations only verify the function label.
; GCN-LABEL: {{^}}fmul_fadd_v2f16:
; GFX9-DENORM-STRICT: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-DENORM-STRICT: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM-CONTRACT: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
                                           ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
  %r0 = load <2 x half>, ptr addrspace(1) %in1
  %r1 = load <2 x half>, ptr addrspace(1) %in2
  %r2 = load <2 x half>, ptr addrspace(1) %in3
  %r3 = fmul <2 x half> %r0, %r1
  %r4 = fadd <2 x half> %r3, %r2
  store <2 x half> %r4, ptr addrspace(1) %out
  ret void
}

; Same fmul + fadd pair, but both instructions carry the 'contract' flag.
; Flushing configurations still expect a separate multiply and add; both
; IEEE-denormal configurations expect a packed FMA regardless of the
; command-line -fp-contract setting.
; GCN-LABEL: {{^}}fmul_fadd_contract_v2f16:
; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_contract_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
                                                    ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
  %r0 = load <2 x half>, ptr addrspace(1) %in1
  %r1 = load <2 x half>, ptr addrspace(1) %in2
  %r2 = load <2 x half>, ptr addrspace(1) %in3
  %r3 = fmul contract <2 x half> %r0, %r1
  %r4 = fadd contract <2 x half> %r3, %r2
  store <2 x half> %r4, ptr addrspace(1) %out
  ret void
}


; llvm.fmuladd with the splat constant 2.0 as the FIRST multiplicand.
; Flushing configurations expect 2.0 * a to become a + a followed by an add
; of the addend; IEEE-denormal configurations expect a packed FMA with the
; loaded value as src0 and the inline constant 2.0 as src1.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  ; NOTE(review): %in is never used; both inputs are read through %out at
  ; element %tid and %tid + 1, and the result is written back at %tid.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid

  ; Volatile loads: presumably to keep two separate dword loads in program
  ; order so the two load captures above bind predictably -- TODO confirm.
  %r1 = load volatile <2 x half>, ptr addrspace(1) %gep.0
  %r2 = load volatile <2 x half>, ptr addrspace(1) %gep.1

  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2)
  store <2 x half> %r3, ptr addrspace(1) %gep.out
  ret void
}

; llvm.fmuladd with the splat constant 2.0 as the SECOND multiplicand.
; The expected instructions are identical to the 2.0-first variant: a + a
; plus addend when flushing, packed FMA by 2.0 with IEEE denormals.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  ; NOTE(review): %in is never used; both inputs are read through %out at
  ; element %tid and %tid + 1, and the result is written back at %tid.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid

  ; Volatile loads: presumably to keep two separate dword loads in program
  ; order so the two load captures above bind predictably -- TODO confirm.
  %r1 = load volatile <2 x half>, ptr addrspace(1) %gep.0
  %r2 = load volatile <2 x half>, ptr addrspace(1) %gep.1

  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2)
  store <2 x half> %r3, ptr addrspace(1) %gep.out
  ret void
}

; (a + a) + b written as two plain fadds, with no fast-math flags.
; Flushing configurations and the IEEE-denormal -fp-contract=on configuration
; expect the two packed adds to be kept; the IEEE-denormal -fp-contract=fast
; configuration expects them to be rewritten as a packed FMA of a by 2.0
; plus b. Every configuration then checks that the result is stored.
; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_v2f16(ptr addrspace(1) %out,
                                            ptr addrspace(1) %in1,
                                            ptr addrspace(1) %in2) #0 {
  ; NOTE(review): %in1 and %in2 are never used; both inputs are read through
  ; %out at element %tid and %tid + 1, and the result is written at %tid.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, ptr addrspace(1) %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, ptr addrspace(1) %out, i32 %tid

  ; Volatile loads: presumably to keep two separate dword loads in program
  ; order so the two load captures above bind predictably -- TODO confirm.
  %r0 = load volatile <2 x half>, ptr addrspace(1) %gep.0
  %r1 = load volatile <2 x half>, ptr addrspace(1) %gep.1

  %add.0 = fadd <2 x half> %r0, %r0
  %add.1 = fadd <2 x half> %add.0, %r1
  store <2 x half> %add.1, ptr addrspace(1) %gep.out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }