; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=SIVI,SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=SIVI,VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s

declare double @llvm.copysign.f64(double, double) #0
declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) #0
declare <3 x double> @llvm.copysign.v3f64(<3 x double>, <3 x double>) #0
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) #0

define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) {
; SI-LABEL: s_test_copysign_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x1d
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_bfi_b32 v1, s4, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x74
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double %sign)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s7, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s1, 31
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset0_b32 s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double 0.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s7, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s1, 31
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset0_b32 s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double 1.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_10:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s7, 0x7fffffff
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset0_b32 s1, 31
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset0_b32 s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double 10.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s4, s7, 0x80000000
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s1, 31
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset1_b32 s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double -1.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SI-LABEL: s_test_copysign_f64_neg10:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s4, s7, 0x80000000
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s1, 31
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_bitset1_b32 s1, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double %mag, double -10.0)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) {
; SI-LABEL: s_test_copysign_f64_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dword s4, s[4:5], 0x1d
; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: v_bfi_b32 v1, s5, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dword s6, s[4:5], 0x74
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x74
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %sign.ext = fpext float %sign to double
  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) {
; SI-LABEL: s_test_copysign_f64_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_bfi_b32 v1, s6, v1, v0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x74
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x74
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c
; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_endpgm
  %sign.ext = fpext half %sign to double
  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  store double %result, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_0_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_0_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s3, 0x80000000
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double 0.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_1_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_or_b32 s0, s0, 0x3ff00000
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_1_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s3, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double 1.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_or_b32 s0, s0, 0x40240000
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_10_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s3, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double 10.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_neg1_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_or_b32 s0, s0, 0x3ff00000
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg1_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s3, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double -1.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out, double %sign) {
; SI-LABEL: s_test_copysign_f64_neg10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_and_b32 s0, s3, 0x80000000
; SI-NEXT: s_or_b32 s0, s0, 0x40240000
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_f64_neg10_mag:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s3, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call double @llvm.copysign.f64(double -10.0, double %sign)
  store double %result, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) {
; SI-LABEL: s_test_copysign_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: v_bfi_b32 v3, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_bfi_b32 v1, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s15
; VI-NEXT: v_mov_b32_e32 v2, s9
; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_mov_b32_e32 v2, s13
; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v2
; GFX11-NEXT: v_mov_b32_e32 v2, s10
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
  store <2 x double> %result, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) {
; SI-LABEL: s_test_copysign_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_mov_b32_e32 v1, s19
; SI-NEXT: v_bfi_b32 v3, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_bfi_b32 v1, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v2, s21
; SI-NEXT: v_bfi_b32 v5, s6, v0, v2
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s19
; VI-NEXT: v_mov_b32_e32 v2, s9
; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s17
; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: v_mov_b32_e32 v2, s21
; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v3f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s19
; GFX11-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v0, s8
; GFX11-NEXT: v_dual_mov_b32 v7, s17 :: v_dual_mov_b32 v4, s12
; GFX11-NEXT: v_mov_b32_e32 v2, s10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s13, v5
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s11, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s9, v7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
  store <3 x double> %result, ptr addrspace(1) %out, align 32
  ret void
}

define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) {
; SI-LABEL: s_test_copysign_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_mov_b32_e32 v1, s19
; SI-NEXT: v_bfi_b32 v3, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_mov_b32_e32 v1, s17
; SI-NEXT: v_bfi_b32 v1, s6, v0, v1
; SI-NEXT: v_mov_b32_e32 v0, s15
; SI-NEXT: v_mov_b32_e32 v2, s23
; SI-NEXT: v_bfi_b32 v7, s6, v0, v2
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v2, s21
; SI-NEXT: v_bfi_b32 v5, s6, v0, v2
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_mov_b32_e32 v6, s14
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_test_copysign_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s19
; VI-NEXT: v_mov_b32_e32 v2, s9
; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s17
; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s15
; VI-NEXT: v_mov_b32_e32 v2, s23
; VI-NEXT: v_bfi_b32 v7, s2, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s13
; VI-NEXT: v_mov_b32_e32 v2, s21
; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v6, s14
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_v4f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s19
; GFX11-NEXT: v_dual_mov_b32 v3, s23 :: v_dual_mov_b32 v2, s14
; GFX11-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v4, s8
; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, s11, v1
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s15, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s13, v9
; GFX11-NEXT: v_mov_b32_e32 v6, s10
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s9, v5
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
  store <4 x double> %result, ptr addrspace(1) %out, align 32
  ret void
}

define double @v_test_copysign_f64(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], double %sign) {
; SIVI-LABEL: v_test_copysign_f64:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_bfi_b32 v1, s4, v11, v21
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v21
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call double @llvm.copysign.f64(double %mag, double %sign)
  ret double %result
}

define double @v_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SIVI-LABEL: v_test_copysign_f64_0:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call double @llvm.copysign.f64(double %mag, double 0.0)
  ret double %result
}

define double @v_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SIVI-LABEL: v_test_copysign_f64_1:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call double @llvm.copysign.f64(double %mag, double 1.0)
  ret double %result
}

define double @v_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SIVI-LABEL: v_test_copysign_f64_10:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v11
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_and_b32 v1, 0x7fffffff, v11
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call double @llvm.copysign.f64(double %mag, double 10.0)
  ret double %result
}

define double @v_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SIVI-LABEL: v_test_copysign_f64_neg1:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v11
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v11
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call double @llvm.copysign.f64(double %mag, double -1.0)
  ret double %result
}

define double @v_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x i32], double %mag) {
; SIVI-LABEL: v_test_copysign_f64_neg10:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_or_b32_e32 v1, 0x80000000, v11
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: v_or_b32_e32 v1, 0x80000000, v11
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call double @llvm.copysign.f64(double %mag, double -10.0)
  ret double %result
}

define double @v_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], float %sign) {
; SIVI-LABEL: v_test_copysign_f64_f32:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_mov_b32_e32 v0, v10
; SIVI-NEXT: v_bfi_b32 v1, s4, v11, v20
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v20
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %sign.ext = fpext float %sign to double
  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  ret double %result
}

define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double %mag, [8 x i32], half %sign) {
; SI-LABEL: v_test_copysign_f64_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_mov_b32_e32 v0, v10
; SI-NEXT: v_bfi_b32 v1, s4, v11, v20
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f64_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v20
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_mov_b32_e32 v0, v10
; VI-NEXT: v_bfi_b32 v1, s4, v11, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_f64_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_lshlrev_b32 v1, 16, v20
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %sign.ext = fpext half %sign to double
  %result = call double @llvm.copysign.f64(double %mag, double %sign.ext)
  ret double %result
}

define <2 x double> @v_test_copysign_v2f64(ptr addrspace(1) %out, <2 x double> %mag, <2 x double> %sign) {
; SIVI-LABEL: v_test_copysign_v2f64:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_mov_b32_e32 v0, v2
; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v7
; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v9
; SIVI-NEXT: v_mov_b32_e32 v2, v4
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v7
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v9
; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
  ret <2 x double> %result
}

define <3 x double> @v_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) {
; SIVI-LABEL: v_test_copysign_v3f64:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_mov_b32_e32 v0, v2
; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v9
; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v11
; SIVI-NEXT: v_bfi_b32 v5, s4, v7, v13
; SIVI-NEXT: v_mov_b32_e32 v2, v4
; SIVI-NEXT: v_mov_b32_e32 v4, v6
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v3f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v9
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v11
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v7, v13
; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v6
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign)
  ret <3 x double> %result
}

define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) {
; SIVI-LABEL: v_test_copysign_v4f64:
; SIVI: ; %bb.0:
; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SIVI-NEXT: s_brev_b32 s4, -2
; SIVI-NEXT: v_mov_b32_e32 v0, v2
; SIVI-NEXT: v_bfi_b32 v1, s4, v3, v11
; SIVI-NEXT: v_bfi_b32 v3, s4, v5, v13
; SIVI-NEXT: v_bfi_b32 v5, s4, v7, v15
; SIVI-NEXT: v_bfi_b32 v7, s4, v9, v17
; SIVI-NEXT: v_mov_b32_e32 v2, v4
; SIVI-NEXT: v_mov_b32_e32 v4, v6
; SIVI-NEXT: v_mov_b32_e32 v6, v8
; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_copysign_v4f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v11
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v5, v13
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v7, v15
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v9, v17
; GFX11-NEXT: v_mov_b32_e32 v2, v4
; GFX11-NEXT: v_mov_b32_e32 v4, v6
; GFX11-NEXT: v_mov_b32_e32 v6, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
  ret <4 x double> %result
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }