; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
; RUN: not llc -mtriple=r600 -mcpu=redwood < %s

define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: s_fneg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub float -0.000000e+00, %in
  store float %fneg, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x float> %in) {
; SI-LABEL: s_fneg_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; SI-NEXT:    s_xor_b32 s1, s2, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
  store <2 x float> %fneg, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x float> %in) {
; SI-LABEL: s_fneg_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; SI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; SI-NEXT:    s_xor_b32 s1, s1, 0x80000000
; SI-NEXT:    s_xor_b32 s0, s0, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s3, s3, 0x80000000
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    s_xor_b32 s1, s1, 0x80000000
; VI-NEXT:    s_xor_b32 s0, s0, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v4f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_xor_b32 s0, s0, 0x80000000
; GFX11-NEXT:    s_xor_b32 s1, s1, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
; GFX11-NEXT:    s_endpgm
  %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
  store <4 x float> %fneg, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fsub0_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v0, 0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fsub0_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v2, 0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fsub0_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e64 v1, 0, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to float
  %fsub = fsub float 0.0, %bc
  store float %fsub, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: fneg_free_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fneg_free_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fneg_free_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to float
  %fsub = fsub float -0.0, %bc
  store float %fsub, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: fneg_fold_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e64 v0, -s6, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fneg_fold_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e64 v2, -s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: fneg_fold_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mul_f32_e64 v1, -s2, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fsub = fsub float -0.0, %in
  %fmul = fmul float %fsub, %in
  store float %fmul, ptr addrspace(1) %out
  ret void
}

; Make sure we turn some integer operations back into fneg
define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in) {
; SI-LABEL: bitpreserve_fneg_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_f32_e64 v0, s6, -4.0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bitpreserve_fneg_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mul_f32_e64 v2, s2, -4.0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: bitpreserve_fneg_f32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mul_f32_e64 v1, s2, -4.0
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in.bc = bitcast float %in to i32
  %int.abs = xor i32 %in.bc, 2147483648
  %bc = bitcast i32 %int.abs to float
  %fadd = fmul float %bc, 4.0
  store float %fadd, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fneg_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80000000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_xor_b32 s2, s2, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i32 %in, -2147483648
  store i32 %fneg, ptr addrspace(1) %out
  ret void
}

define i32 @v_fneg_i32(i32 %in) {
; GCN-LABEL: v_fneg_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i32 %in, -2147483648
  ret i32 %fneg
}

define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_fneg_i32_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_sub_f32_e64 v0, 2.0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i32_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f32_e64 v2, 2.0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i32_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f32_e64 v1, 2.0, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i32 %in, -2147483648
  %bitcast = bitcast i32 %fneg to float
  %fadd = fadd float %bitcast, 2.0
  store float %fadd, ptr addrspace(1) %out
  ret void
}

define float @v_fneg_i32_fp_use(i32 %in) {
; GCN-LABEL: v_fneg_i32_fp_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i32 %in, -2147483648
  %bitcast = bitcast i32 %fneg to float
  %fadd = fadd float %bitcast, 2.0
  ret float %fadd
}

define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_fneg_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_xor_b32 s0, s3, 0x80000000
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s3, s3, 0x80000000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i64 %in, -9223372036854775808
  store i64 %fneg, ptr addrspace(1) %out
  ret void
}

define i64 @v_fneg_i64(i64 %in) {
; GCN-LABEL: v_fneg_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i64 %in, -9223372036854775808
  ret i64 %fneg
}

define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_fneg_i64_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i64_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i64_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_f64 v[0:1], -s[2:3], 2.0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i64 %in, -9223372036854775808
  %bitcast = bitcast i64 %fneg to double
  %fadd = fadd double %bitcast, 2.0
  store double %fadd, ptr addrspace(1) %out
  ret void
}

define double @v_fneg_i64_fp_use(i64 %in) {
; GCN-LABEL: v_fneg_i64_fp_use:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_add_f64 v[0:1], -v[0:1], 2.0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i64 %in, -9223372036854775808
  %bitcast = bitcast i64 %fneg to double
  %fadd = fadd double %bitcast, 2.0
  ret double %fadd
}

define i16 @v_fneg_i16(i16 %in) {
; GCN-LABEL: v_fneg_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i16 %in, -32768
  ret i16 %fneg
}

define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
; SI-LABEL: s_fneg_i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_sub_f16_e64 v2, 2.0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_sub_f16_e64 v1, 2.0, s2
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fneg = xor i16 %in, -32768
  %bitcast = bitcast i16 %fneg to half
  %fadd = fadd half %bitcast, 2.0
  store half %fadd, ptr addrspace(1) %out
  ret void
}

define half @v_fneg_i16_fp_use(i16 %in) {
; SI-LABEL: v_fneg_i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor i16 %in, -32768
  %bitcast = bitcast i16 %fneg to half
  %fadd = fadd half %bitcast, 2.0
  ret half %fadd
}

define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: s_fneg_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_xor_b32 s4, s6, 0x80008000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
; VI-NEXT:    s_xor_b32 s3, s3, 0x8000
; VI-NEXT:    s_and_b32 s2, s2, 0xffff
; VI-NEXT:    s_lshl_b32 s3, s3, 16
; VI-NEXT:    s_or_b32 s2, s2, s3
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_xor_b32 s2, s2, 0x80008000
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  store <2 x i16> %fneg, ptr addrspace(1) %out
  ret void
}

define <2 x i16> @v_fneg_v2i16(<2 x i16> %in) {
; SI-LABEL: v_fneg_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; SI-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  ret <2 x i16> %fneg
}

define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg) {
; SI-LABEL: s_fneg_v2i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_fneg_v2i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    s_xor_b32 s3, s3, 0x8000
; VI-NEXT:    s_xor_b32 s2, s2, 0x8000
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_f16_e64 v1, s2, 2.0
; VI-NEXT:    v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v1, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  %bitcast = bitcast <2 x i16> %fneg to <2 x half>
  %fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
  store <2 x half> %fadd, ptr addrspace(1) %out
  ret void
}

define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
; SI-LABEL: v_fneg_v2i16_fp_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sub_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_v2i16_fp_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
; VI-NEXT:    v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_sub_f16_e32 v0, 2.0, v0
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_v2i16_fp_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %in = bitcast i32 %arg to <2 x i16>
  %fneg = xor <2 x i16> %in, <i16 -32768, i16 -32768>
  %bitcast = bitcast <2 x i16> %fneg to <2 x half>
  %fadd = fadd <2 x half> %bitcast, <half 2.0, half 2.0>
  ret <2 x half> %fadd
}