; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s

; Code generation tests for the llvm.copysign intrinsic on float and on vectors
; of float, checked for three AMDGPU processors (tahiti, tonga and gfx1100).
;
; Functions whose names start with s_ are amdgpu_kernel entry points: they take
; their operands as kernel arguments and store the result through %out.
; Functions whose names start with v_ are ordinary functions that receive the
; operands as parameters and return the result.
;
; The assertion lines are generated by the script named in the first line of
; this file and should be regenerated rather than edited by hand.

; Kernel: copysign of two float kernel arguments (%mag, %sign).
define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag, float %sign) {
; SI-LABEL: s_test_copysign_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_brev_b32 s8, -2
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_bfi_b32 v0, s8, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_bfi_b32 v2, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float %mag, float %sign)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the sign operand is the constant 0.0. The expected code clears bit 31
; of %mag (an AND with 0x7fffffff, or s_bitset0_b32 on bit 31).
define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %mag) {
; SI-LABEL: s_test_copysign_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset0_b32 s2, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float %mag, float 0.0)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the sign operand is the constant 1.0. The expected code is the same
; bit-31 clear that is expected for a 0.0 sign operand.
define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %mag) {
; SI-LABEL: s_test_copysign_f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset0_b32 s2, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float %mag, float 1.0)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the sign operand is the constant 10.0. The expected code again only
; clears bit 31 of %mag.
define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float %mag) {
; SI-LABEL: s_test_copysign_f32_10.0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_10.0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset0_b32 s2, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float %mag, float 10.0)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the sign operand is the constant -1.0. The expected code sets bit 31
; of %mag (an OR with 0x80000000, or s_bitset1_b32 on bit 31).
define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float %mag) {
; SI-LABEL: s_test_copysign_f32_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s4, s6, 0x80000000
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset1_b32 s2, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float %mag, float -1.0)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the sign operand is the constant -10.0. The expected code is the same
; bit-31 set that is expected for a -1.0 sign operand.
define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, float %mag) {
; SI-LABEL: s_test_copysign_f32_neg10:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s4, s6, 0x80000000
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset1_b32 s2, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float %mag, float -10.0)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the magnitude operand is the constant 0.0. The expected code keeps
; only bit 31 of %sign (an AND with 0x80000000).
define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, float %sign) {
; SI-LABEL: s_test_copysign_f32_0_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_0_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float 0.0, float %sign)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}


; Kernel: the magnitude operand is the constant 1.0. The expected code masks
; bit 31 of %sign and ORs the result with the 1.0 constant.
define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, float %sign) {
; SI-LABEL: s_test_copysign_f32_1_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_or_b32 s4, s4, 1.0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_1_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x80000000
; VI-NEXT: s_or_b32 s2, s2, 1.0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 1.0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float 1.0, float %sign)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the magnitude operand is the constant 10.0. The expected code masks
; bit 31 of %sign and ORs the result with 0x41200000.
define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, float %sign) {
; SI-LABEL: s_test_copysign_f32_10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_or_b32 s4, s4, 0x41200000
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_10_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x80000000
; VI-NEXT: s_or_b32 s2, s2, 0x41200000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float 10.0, float %sign)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the magnitude operand is the constant -1.0. The expected code is the
; same as for a 1.0 magnitude: the OR uses the positive 1.0 constant.
define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, float %sign) {
; SI-LABEL: s_test_copysign_f32_neg1_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_or_b32 s4, s4, 1.0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_neg1_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x80000000
; VI-NEXT: s_or_b32 s2, s2, 1.0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 1.0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float -1.0, float %sign)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: the magnitude operand is the constant -10.0. The expected code is the
; same as for a 10.0 magnitude: the OR uses 0x41200000.
define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out, float %sign) {
; SI-LABEL: s_test_copysign_f32_neg10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_or_b32 s4, s4, 0x41200000
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_neg10_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x80000000
; VI-NEXT: s_or_b32 s2, s2, 0x41200000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call float @llvm.copysign.f32(float -10.0, float %sign)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: copysign on <2 x float> kernel arguments. The expected code contains
; one v_bfi_b32 per vector element.
define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x float> %mag, <2 x float> %sign) {
; SI-LABEL: s_test_copysign_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: s_brev_b32 s8, -2
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_bfi_b32 v1, s8, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_bfi_b32 v0, s8, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s6, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_bfi_b32 v1, s6, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_bfi_b32 v0, s6, v2, v0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v2
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
  %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
  store <2 x float> %result, ptr addrspace(1) %out, align 8
  ret void
}

; Kernel: copysign on <3 x float> kernel arguments; the result is stored with
; align 16.
define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x float> %mag, <3 x float> %sign) {
; SI-LABEL: s_test_copysign_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_bfi_b32 v1, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s12
; SI-NEXT: v_bfi_b32 v0, s6, v0, v2
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s14
; SI-NEXT: v_bfi_b32 v2, s6, v2, v3
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mov_b32_e32 v1, s14
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: v_bfi_b32 v1, s2, v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v3, s12
; VI-NEXT: v_bfi_b32 v0, s2, v0, v3
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v3f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v1, s13
; GFX11-NEXT: v_mov_b32_e32 v3, s12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s10, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s8, v3
; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
  store <3 x float> %result, ptr addrspace(1) %out, align 16
  ret void
}

; Kernel: copysign on <4 x float> kernel arguments.
define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x float> %mag, <4 x float> %sign) {
; SI-LABEL: s_test_copysign_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: v_bfi_b32 v3, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: v_mov_b32_e32 v1, s14
; SI-NEXT: v_bfi_b32 v2, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_bfi_b32 v1, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_bfi_b32 v0, s6, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s15
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s14
; VI-NEXT: v_bfi_b32 v2, s2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_bfi_b32 v0, s2, v0, v4
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
; GFX11-NEXT: v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v5, s12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v0
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, s10, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v4
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s8, v5
; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
  store <4 x float> %result, ptr addrspace(1) %out, align 16
  ret void
}

; Non-kernel function: copysign of two float parameters, result returned.
; The tahiti and tonga runs share one set of assertions for this function.
define float @v_test_copysign_f32(float %mag, float %sign) {
; SIVI-LABEL: v_test_copysign_f32:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call float @llvm.copysign.f32(float %mag, float %sign)
  ret float %result
}

; Non-kernel function: the sign operand is the constant 0.0. The expected code
; is a single AND of %mag with 0x7fffffff.
define float @v_test_copysign_f32_0(float %mag) {
; SIVI-LABEL: v_test_copysign_f32_0:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f32_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call float @llvm.copysign.f32(float %mag, float 0.0)
  ret float %result
}

; Non-kernel function: the sign operand is the constant 1.0.
define float @v_test_copysign_f32_1(float %mag) {
; SIVI-LABEL: v_test_copysign_f32_1:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f32_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call float @llvm.copysign.f32(float %mag, float 1.0)
  ret float %result
}

; Non-kernel function: the sign operand is the constant 10.0.
define float @v_test_copysign_f32_10(float %mag) {
; SIVI-LABEL: v_test_copysign_f32_10:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f32_10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call float @llvm.copysign.f32(float %mag, float 10.0)
  ret float %result
}

; Non-kernel function: the sign operand is the constant -1.0. The expected code
; is a single OR of %mag with 0x80000000.
define float @v_test_copysign_f32_neg1(float %mag) {
; SIVI-LABEL: v_test_copysign_f32_neg1:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f32_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call float @llvm.copysign.f32(float %mag, float -1.0)
  ret float %result
}

; Non-kernel function: the sign operand is the constant -10.0.
define float @v_test_copysign_f32_neg10(float %mag) {
; SIVI-LABEL: v_test_copysign_f32_neg10:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f32_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call float @llvm.copysign.f32(float %mag, float -10.0)
  ret float %result
}

; Non-kernel function: copysign on <2 x float> parameters.
define <2 x float> @v_test_copysign_v2f32(<2 x float> %mag, <2 x float> %sign) {
; SIVI-LABEL: v_test_copysign_v2f32:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v2
; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v3
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
  ret <2 x float> %result
}

; Non-kernel function: <2 x float> magnitude with a zeroinitializer sign
; vector. The expected code is one AND with 0x7fffffff per element.
define <2 x float> @v_test_copysign_v2f32_0(<2 x float> %mag) {
; SIVI-LABEL: v_test_copysign_v2f32_0:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v2f32_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> zeroinitializer)
  ret <2 x float> %result
}

; Non-kernel function: <2 x float> magnitude with a sign vector whose elements
; are both -1.0. The expected code is one OR with 0x80000000 per element.
define <2 x float> @v_test_copysign_v2f32_neg1(<2 x float> %mag) {
; SIVI-LABEL: v_test_copysign_v2f32_neg1:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v1
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v2f32_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 0x80000000, v0
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> <float -1.0, float -1.0>)
  ret <2 x float> %result
}

; Non-kernel function: copysign on <3 x float> parameters.
define <3 x float> @v_test_copysign_v3f32(<3 x float> %mag, <3 x float> %sign) {
; SIVI-LABEL: v_test_copysign_v3f32:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v3
; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v4
; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v5
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v3f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign)
  ret <3 x float> %result
}

; Non-kernel function: copysign on <4 x float> parameters.
define <4 x float> @v_test_copysign_v4f32(<4 x float> %mag, <4 x float> %sign) {
; SIVI-LABEL: v_test_copysign_v4f32:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v4
; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v5
; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v6
; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v7
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v6
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
  ret <4 x float> %result
}

; Non-kernel function: copysign on <5 x float> parameters.
define <5 x float> @v_test_copysign_v5f32(<5 x float> %mag, <5 x float> %sign) {
; SIVI-LABEL: v_test_copysign_v5f32:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_bfi_b32 v0, s4, v0, v5
; SIVI-NEXT: v_bfi_b32 v1, s4, v1, v6
; SIVI-NEXT: v_bfi_b32 v2, s4, v2, v7
; SIVI-NEXT: v_bfi_b32 v3, s4, v3, v8
; SIVI-NEXT: v_bfi_b32 v4, s4, v4, v9
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v5f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v4, v9
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <5 x float> @llvm.copysign.v5f32(<5 x float> %mag, <5 x float> %sign)
  ret <5 x float> %result
}

; Kernel: the sign operand is a double kernel argument truncated to float with
; fptrunc before the copysign. The expected code contains no conversion
; instruction for the truncation.
define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out, float %mag, double %sign) {
; SI-LABEL: s_test_copysign_f32_fptrunc_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_fptrunc_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_endpgm
  %sign.trunc = fptrunc double %sign to float
  %result = call float @llvm.copysign.f32(float %mag, float %sign.trunc)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

; Kernel: constant 1.0 magnitude combined with a sign operand that is a double
; kernel argument truncated to float with fptrunc.
define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_or_b32 s0, s0, 1.0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s3, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 1.0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 1.0
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %sign.trunc = fptrunc double %sign to float
  %result = call float @llvm.copysign.f32(float 1.0, float %sign.trunc)
  store float %result, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, float %mag, half %sign) {
; SI-LABEL: s_test_copysign_f32_fpext_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f32_fpext_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_bfi_b32 v2, s4, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
1000; GFX11-LABEL: s_test_copysign_f32_fpext_f16: 1001; GFX11: ; %bb.0: 1002; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1003; GFX11-NEXT: v_mov_b32_e32 v1, 0 1004; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1005; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 1006; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1007; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 1008; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1009; GFX11-NEXT: s_endpgm 1010 %sign.ext = fpext half %sign to float 1011 %result = call float @llvm.copysign.f32(float %mag, float %sign.ext) 1012 store float %result, ptr addrspace(1) %out, align 4 1013 ret void 1014} 1015 1016define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out, half %sign) { 1017; SI-LABEL: s_test_copysign_f32_1_fpext_f16: 1018; SI: ; %bb.0: 1019; SI-NEXT: s_load_dword s0, s[4:5], 0xb 1020; SI-NEXT: s_mov_b32 s3, 0xf000 1021; SI-NEXT: s_mov_b32 s2, -1 1022; SI-NEXT: s_waitcnt lgkmcnt(0) 1023; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 1024; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1025; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0 1026; SI-NEXT: v_or_b32_e32 v0, 1.0, v0 1027; SI-NEXT: s_waitcnt lgkmcnt(0) 1028; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1029; SI-NEXT: s_endpgm 1030; 1031; VI-LABEL: s_test_copysign_f32_1_fpext_f16: 1032; VI: ; %bb.0: 1033; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1034; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1035; VI-NEXT: s_waitcnt lgkmcnt(0) 1036; VI-NEXT: s_lshl_b32 s2, s2, 16 1037; VI-NEXT: s_and_b32 s2, s2, 0x80000000 1038; VI-NEXT: s_or_b32 s2, s2, 1.0 1039; VI-NEXT: v_mov_b32_e32 v0, s0 1040; VI-NEXT: v_mov_b32_e32 v1, s1 1041; VI-NEXT: v_mov_b32_e32 v2, s2 1042; VI-NEXT: flat_store_dword v[0:1], v2 1043; VI-NEXT: s_endpgm 1044; 1045; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16: 1046; GFX11: ; %bb.0: 1047; GFX11-NEXT: s_clause 0x1 1048; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 1049; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1050; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1051; GFX11-NEXT: 
s_lshl_b32 s2, s2, 16 1052; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1053; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000 1054; GFX11-NEXT: s_or_b32 s2, s2, 1.0 1055; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1056; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 1057; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1058; GFX11-NEXT: s_endpgm 1059 %sign.ext = fpext half %sign to float 1060 %result = call float @llvm.copysign.f32(float 1.0, float %sign.ext) 1061 store float %result, ptr addrspace(1) %out, align 4 1062 ret void 1063} 1064 1065define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, float %mag, bfloat %sign) { 1066; SI-LABEL: s_test_copysign_f32_fpext_bf16: 1067; SI: ; %bb.0: 1068; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1069; SI-NEXT: s_mov_b32 s7, 0xf000 1070; SI-NEXT: s_mov_b32 s6, -1 1071; SI-NEXT: s_waitcnt lgkmcnt(0) 1072; SI-NEXT: s_mov_b32 s4, s0 1073; SI-NEXT: s_lshl_b32 s0, s3, 16 1074; SI-NEXT: s_mov_b32 s5, s1 1075; SI-NEXT: s_brev_b32 s1, -2 1076; SI-NEXT: v_mov_b32_e32 v0, s2 1077; SI-NEXT: v_mov_b32_e32 v1, s0 1078; SI-NEXT: v_bfi_b32 v0, s1, v0, v1 1079; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1080; SI-NEXT: s_endpgm 1081; 1082; VI-LABEL: s_test_copysign_f32_fpext_bf16: 1083; VI: ; %bb.0: 1084; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1085; VI-NEXT: s_brev_b32 s4, -2 1086; VI-NEXT: s_waitcnt lgkmcnt(0) 1087; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 1088; VI-NEXT: v_mov_b32_e32 v1, s2 1089; VI-NEXT: v_bfi_b32 v2, s4, v1, v0 1090; VI-NEXT: v_mov_b32_e32 v0, s0 1091; VI-NEXT: v_mov_b32_e32 v1, s1 1092; VI-NEXT: flat_store_dword v[0:1], v2 1093; VI-NEXT: s_endpgm 1094; 1095; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: 1096; GFX11: ; %bb.0: 1097; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1098; GFX11-NEXT: v_mov_b32_e32 v1, 0 1099; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1100; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 1101; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 1102; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 1103; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1104; GFX11-NEXT: s_endpgm 1105 %sign.ext = fpext bfloat %sign to float 1106 %result = call float @llvm.copysign.f32(float %mag, float %sign.ext) 1107 store float %result, ptr addrspace(1) %out, align 4 1108 ret void 1109} 1110 1111declare float @llvm.copysign.f32(float, float) #0 1112declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) #0 1113declare <3 x float> @llvm.copysign.v3f32(<3 x float>, <3 x float>) #0 1114declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) #0 1115declare <5 x float> @llvm.copysign.v5f32(<5 x float>, <5 x float>) #0 1116 1117attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } 1118