; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s

; Codegen test for <N x float> fadd/fmul/fma on AMDGPU.
; The gfx900 invocation is checked for one 32-bit VALU instruction per vector
; element. The gfx90a and gfx940 invocations are checked for packed v_pk_*
; instructions, each run once with -global-isel=0 (SelectionDAG) and once with
; -global-isel=1 (GlobalISel); separate prefixes are used where the two
; selectors are expected to differ.

; <2 x float> fadd with both operands in VGPRs (the loaded value added to itself).
; GCN-LABEL: {{^}}fadd_v2_vv:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, %load
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; <2 x float> fadd of a loaded value and a kernel-argument vector; the checks
; expect the argument to be used as an SGPR source operand.
; GCN-LABEL: {{^}}fadd_v2_vs:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, %x
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; <4 x float> variant: two packed adds expected (four scalar adds on gfx900).
; GCN-LABEL: {{^}}fadd_v4_vs:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
  %add = fadd <4 x float> %load, %x
  store <4 x float> %add, ptr addrspace(1) %gep, align 16
  ret void
}

; <32 x float> variant: sixteen packed adds expected (thirty-two scalar adds on
; gfx900).
; GCN-LABEL: {{^}}fadd_v32_vs:
; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
  %add = fadd <32 x float> %load, %x
  store <32 x float> %add, ptr addrspace(1) %gep, align 128
  ret void
}

; FIXME: GISel does not use op_sel for splat constants.

; Splat of the constant 100.0 (0x42c80000). Packed targets are expected to
; materialize it with s_mov_b32; only the SelectionDAG checks expect
; op_sel_hi:[1,0] on the packed add.
; GCN-LABEL: {{^}}fadd_v2_v_imm:
; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Splat of a VGPR value (the workitem id bitcast to float).
; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Splat of 1.0. The SelectionDAG checks expect the constant as a direct operand
; of the packed add; the GlobalISel checks expect an SGPR pair instead.
; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 1.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Constant vector <1.0, 0.0>: packed checks expect one s_mov_b64 of 0x3f800000.
; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; PACKED-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 0.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Constant vector <0.0, 1.0>: packed checks expect two s_mov_b32 that form the
; SGPR pair consumed by the packed add.
; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; PACKED-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
; PACKED-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 0.0, float 1.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Constant vector <1.0, 2.0> with distinct elements: packed checks expect both
; halves to be moved into SGPRs.
; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
; PACKED: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 2.0>
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; FIXME: Fold fneg into v_pk_add_f32 with Global ISel.

; Splat of a negated scalar argument: the SelectionDAG checks expect the
; negation as neg_lo/neg_hi modifiers on the packed add.
; GCN-LABEL: {{^}}fadd_v2_v_fneg:
; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Vector <-x, x>: only the low element is negated.
; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %x, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Vector <x, -x>: only the high element is negated.
; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %x, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Vector <-x, y> built from two different scalar arguments.
; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, float %y) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %y, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; Vector <y, -x> built from two different scalar arguments.
; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, float %y) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %y, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, ptr addrspace(1) %gep, align 8
  ret void
}

; <2 x float> fmul with both operands in VGPRs (the loaded value squared).
; GCN-LABEL: {{^}}fmul_v2_vv:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, %load
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; <2 x float> fmul of a loaded value and a kernel-argument vector.
; GCN-LABEL: {{^}}fmul_v2_vs:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, %x
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; <4 x float> variant: two packed multiplies expected.
; GCN-LABEL: {{^}}fmul_v4_vs:
; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
  %mul = fmul <4 x float> %load, %x
  store <4 x float> %mul, ptr addrspace(1) %gep, align 16
  ret void
}

; <32 x float> variant: sixteen packed multiplies expected.
; GCN-LABEL: {{^}}fmul_v32_vs:
; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
  %mul = fmul <32 x float> %load, %x
  store <32 x float> %mul, ptr addrspace(1) %gep, align 128
  ret void
}

; Multiply by a splat of 100.0 (0x42c80000); only the SelectionDAG checks
; expect op_sel_hi:[1,0].
; GCN-LABEL: {{^}}fmul_v2_v_imm:
; PACKED: s_mov_b32 s[[K:[0-9]+]], 0x42c80000
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; Multiply by a splat of a VGPR value (the workitem id bitcast to float).
; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; Multiply by a splat of 4.0; the SelectionDAG checks expect the constant as a
; direct operand, the GlobalISel checks expect an SGPR pair.
; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, <float 4.0, float 4.0>
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; Multiply by the constant vector <4.0, 3.0> (3.0 is 0x40400000): packed checks
; expect both halves to be moved into SGPRs.
; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit:
; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}}
; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
; PACKED-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
; PACKED: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %mul = fmul <2 x float> %load, <float 4.0, float 3.0>
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; Multiply by a splat of a negated scalar argument.
; GCN-LABEL: {{^}}fmul_v2_v_fneg:
; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
; PACKED-SDAG: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, ptr addrspace(1) %gep, align 8
  ret void
}

; <2 x float> fma with all three operands in VGPRs (the same loaded value).
; GCN-LABEL: {{^}}fma_v2_vv:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; <2 x float> fma with the kernel-argument vector used as both the multiplier
; and the addend.
; GCN-LABEL: {{^}}fma_v2_vs:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; <4 x float> variant: two packed fmas expected.
; GCN-LABEL: {{^}}fma_v4_vs:
; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; PACKED-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <4 x float>, ptr addrspace(1) %gep, align 16
  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
  store <4 x float> %fma, ptr addrspace(1) %gep, align 16
  ret void
}

; <32 x float> variant: sixteen packed fmas expected.
; GCN-LABEL: {{^}}fma_v32_vs:
; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; PACKED-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <32 x float>, ptr addrspace(1) %gep, align 128
  %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
  store <32 x float> %fma, ptr addrspace(1) %gep, align 128
  ret void
}

; fma with splat constants 100.0 (0x42c80000) as multiplier and 200.0
; (0x43480000) as addend.
; GCN-LABEL: {{^}}fma_v2_v_imm:
; GCN-DAG: s_mov_b32 s[[K1:[0-9]+]], 0x42c80000
; GFX900-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
; PACKED-SDAG-DAG: v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; fma with a VGPR splat (workitem id bitcast to float) as multiplier and addend.
; GCN-LABEL: {{^}}fma_v2_v_v_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1]{{$}}
define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; fma with splat constants 4.0 (multiplier) and 1.0 (addend); the SelectionDAG
; checks expect both as direct operands of the packed fma.
; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; fma with the non-splat constant vectors <4.0, 3.0> and <1.0, 2.0>.
; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0
; PACKED-SDAG-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
; PACKED-SDAG-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; PACKED: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; fma with a splat of a negated scalar argument as both multiplier and addend.
; GCN-LABEL: {{^}}fma_v2_v_fneg:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
  store <2 x float> %fma, ptr addrspace(1) %gep, align 8
  ret void
}

; A scalar loaded through an addrspace(3) pointer is negated, broadcast with a
; zero-mask shufflevector, and added to a loaded vector.
; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
; GFX900-COUNT-2: v_sub_f32_e32
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
bb:
  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
  %neg.scalar0 = fsub float -0.0, %scalar0

  %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer

  %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
  store <2 x float> %result, ptr addrspace(1) %out, align 4
  ret void
}

; fma whose addend is the negation of a vector assembled from two separately
; loaded scalars.
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; PACKED-SDAG: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
; PACKED-GISEL: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) {
bb:
  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
  %arg2.gep = getelementptr inbounds float, ptr addrspace(3) %arg2, i32 2

  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 4
  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 4

  %scalar0 = load volatile float, ptr addrspace(3) %arg2, align 4
  %scalar1 = load volatile float, ptr addrspace(3) %arg2.gep, align 4

  %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0
  %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
  %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2

  %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
  store <2 x float> %result, ptr addrspace(1) %out, align 4
  ret void
}

; fadd where the second operand has its two elements swapped by a
; shufflevector.
; NOTE(review): this function references attribute group #0, but no
; `attributes #0` definition is visible in this file -- confirm it is intended.
; GCN-LABEL: {{^}}shuffle_add_f32:
; GFX900-COUNT-2: v_add_f32_e32
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
bb:
  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
  %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.swap
  store <2 x float> %result, ptr addrspace(1) %out, align 8
  ret void
}

; Same shape as shuffle_add_f32, but the second operand is negated before its
; elements are swapped. The volatile load into %f32 has no users.
; NOTE(review): attribute group #0 is referenced here as well without a visible
; definition -- confirm it is intended.
; GCN-LABEL: {{^}}shuffle_neg_add_f32:
; GFX900-COUNT-2: v_sub_f32_e32
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
bb:
  %vec0 = load volatile <2 x float>, ptr addrspace(3) %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, ptr addrspace(3) %lds, i32 1
  %f32 = load volatile float, ptr addrspace(3) undef, align 8
  %vec1 = load volatile <2 x float>, ptr addrspace(3) %lds.gep1, align 8
  %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
  %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.neg.swap
  store <2 x float> %result, ptr addrspace(1) %out, align 8
  ret void
}

; fadd / shufflevector / fsub chain using zeroinitializer operands. The
; SelectionDAG checks expect scalar v_add_f32 even on the packed targets; the
; GlobalISel checks expect two packed adds.
; GCN-LABEL: {{^}}fadd_fadd_fsub_0:
; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}

; PACKED-SDAG: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}

; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
bb:
  %i12 = fadd <2 x float> zeroinitializer, %arg
  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
  %i13 = fadd <2 x float> zeroinitializer, %shift8
  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
  %i15 = fsub <2 x float> %i14, zeroinitializer
  store <2 x float> %i15, ptr undef
  ret void
}

; The same chain as fadd_fadd_fsub_0 with a second vector argument in place of
; the zero constants.
; GCN-LABEL: {{^}}fadd_fadd_fsub:
; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}

; PACKED-SDAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}

; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
; PACKED-GISEL: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, ptr addrspace(1) %ptr) {
bb:
  %i12 = fadd <2 x float> %arg, %arg1
  %shift8 = shufflevector <2 x float> %i12, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
  %i13 = fadd <2 x float> %arg1, %shift8
  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
  %i15 = fsub <2 x float> %i14, %arg1
  store <2 x float> %i15, ptr addrspace(1) %ptr
  ret void
}

; <4 x float> fadd of a loaded vector with a splat of its own element 0.
; GCN-LABEL: {{^}}fadd_shuffle_v4:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; PACKED-SDAG-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
; PACKED-GISEL-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  %in.1 = load <4 x float>, ptr addrspace(1) %gep
  %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer
  %add.1 = fadd <4 x float> %in.1, %shuf
  store <4 x float> %add.1, ptr addrspace(1) %gep
  ret void
}

; Negation (fsub from -0.0) of a loaded vector. The gfx900 checks expect two
; sign-bit xors, the SelectionDAG packed checks a v_pk_add_f32 with neg
; modifiers, and the GlobalISel packed checks two xors followed by a
; v_pk_mul_f32 by 1.0.
; GCN-LABEL: {{^}}fneg_v2f32_vec:
; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
; PACKED-SDAG: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
; PACKED-GISEL-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
; PACKED-GISEL: v_pk_mul_f32 v[{{[0-9:]+}}], 1.0, v[{{[0-9:]+}}] op_sel_hi:[0,1]{{$}}
define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
  %load = load <2 x float>, ptr addrspace(1) %gep, align 8
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
  store <2 x float> %fneg, ptr addrspace(1) %gep, align 8
  ret void
}

; Negation of a kernel-argument vector: every configuration is expected to emit
; two s_xor_b32 with the 0x80000000 mask.
; GCN-LABEL: {{^}}fneg_v2f32_scalar:
; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x) {
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
  store <2 x float> %fneg, ptr addrspace(1) %a, align 8
  ret void
}

; Intrinsic declarations used by the tests in this file.
declare i32 @llvm.amdgcn.workitem.id.x()
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)