1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s 6 7declare half @llvm.copysign.f16(half, half) #0 8declare float @llvm.copysign.f32(float, float) #0 9declare double @llvm.copysign.f64(double, double) #0 10declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) #0 11declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) #0 12declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) #0 13declare i32 @llvm.amdgcn.workitem.id.x() #0 14 15define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, half %sign) { 16; SI-LABEL: s_copysign_f16: 17; SI: ; %bb.0: 18; SI-NEXT: s_load_dword s0, s[4:5], 0xb 19; SI-NEXT: s_brev_b32 s2, -2 20; SI-NEXT: s_mov_b32 s3, 0xf000 21; SI-NEXT: s_waitcnt lgkmcnt(0) 22; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 23; SI-NEXT: s_lshr_b32 s0, s0, 16 24; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 25; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 26; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 27; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 28; SI-NEXT: s_mov_b32 s2, -1 29; SI-NEXT: s_waitcnt lgkmcnt(0) 30; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 31; SI-NEXT: s_endpgm 32; 33; VI-LABEL: s_copysign_f16: 34; VI: ; %bb.0: 35; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 36; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 37; VI-NEXT: s_movk_i32 s3, 0x7fff 38; VI-NEXT: s_waitcnt lgkmcnt(0) 39; VI-NEXT: s_lshr_b32 s4, s2, 16 40; VI-NEXT: v_mov_b32_e32 v0, s2 41; VI-NEXT: v_mov_b32_e32 v1, s4 42; VI-NEXT: v_bfi_b32 v2, s3, v0, v1 43; VI-NEXT: v_mov_b32_e32 v0, s0 44; VI-NEXT: v_mov_b32_e32 v1, s1 45; VI-NEXT: flat_store_short 
v[0:1], v2 46; VI-NEXT: s_endpgm 47; 48; GFX9-LABEL: s_copysign_f16: 49; GFX9: ; %bb.0: 50; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 51; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 52; GFX9-NEXT: s_movk_i32 s3, 0x7fff 53; GFX9-NEXT: v_mov_b32_e32 v0, 0 54; GFX9-NEXT: s_waitcnt lgkmcnt(0) 55; GFX9-NEXT: s_lshr_b32 s4, s2, 16 56; GFX9-NEXT: v_mov_b32_e32 v1, s2 57; GFX9-NEXT: v_mov_b32_e32 v2, s4 58; GFX9-NEXT: v_bfi_b32 v1, s3, v1, v2 59; GFX9-NEXT: global_store_short v0, v1, s[0:1] 60; GFX9-NEXT: s_endpgm 61; 62; GFX11-LABEL: s_copysign_f16: 63; GFX11: ; %bb.0: 64; GFX11-NEXT: s_clause 0x1 65; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 66; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 67; GFX11-NEXT: v_mov_b32_e32 v1, 0 68; GFX11-NEXT: s_waitcnt lgkmcnt(0) 69; GFX11-NEXT: s_lshr_b32 s3, s2, 16 70; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 71; GFX11-NEXT: v_mov_b32_e32 v0, s3 72; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 73; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] 74; GFX11-NEXT: s_endpgm 75 %out = call half @llvm.copysign.f16(half %mag, half %sign) 76 store half %out, ptr addrspace(1) %arg_out 77 ret void 78} 79 80define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %mag) { 81; SI-LABEL: s_test_copysign_f16_0: 82; SI: ; %bb.0: 83; SI-NEXT: s_load_dword s6, s[4:5], 0xb 84; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 85; SI-NEXT: s_mov_b32 s3, 0xf000 86; SI-NEXT: s_mov_b32 s2, -1 87; SI-NEXT: s_waitcnt lgkmcnt(0) 88; SI-NEXT: s_and_b32 s4, s6, 0x7fff 89; SI-NEXT: v_mov_b32_e32 v0, s4 90; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 91; SI-NEXT: s_endpgm 92; 93; VI-LABEL: s_test_copysign_f16_0: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 96; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 97; VI-NEXT: s_waitcnt lgkmcnt(0) 98; VI-NEXT: s_and_b32 s2, s2, 0x7fff 99; VI-NEXT: v_mov_b32_e32 v0, s0 100; VI-NEXT: v_mov_b32_e32 v1, s1 101; VI-NEXT: v_mov_b32_e32 v2, s2 102; VI-NEXT: 
flat_store_short v[0:1], v2 103; VI-NEXT: s_endpgm 104; 105; GFX9-LABEL: s_test_copysign_f16_0: 106; GFX9: ; %bb.0: 107; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 109; GFX9-NEXT: v_mov_b32_e32 v0, 0 110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 111; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff 112; GFX9-NEXT: v_mov_b32_e32 v1, s2 113; GFX9-NEXT: global_store_short v0, v1, s[0:1] 114; GFX9-NEXT: s_endpgm 115; 116; GFX11-LABEL: s_test_copysign_f16_0: 117; GFX11: ; %bb.0: 118; GFX11-NEXT: s_clause 0x1 119; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 120; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 121; GFX11-NEXT: s_waitcnt lgkmcnt(0) 122; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff 123; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 124; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 125; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 126; GFX11-NEXT: s_endpgm 127 %result = call half @llvm.copysign.f16(half %mag, half 0.0) 128 store half %result, ptr addrspace(1) %out, align 4 129 ret void 130} 131 132define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %mag) { 133; SI-LABEL: s_test_copysign_f16_1: 134; SI: ; %bb.0: 135; SI-NEXT: s_load_dword s6, s[4:5], 0xb 136; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 137; SI-NEXT: s_mov_b32 s3, 0xf000 138; SI-NEXT: s_mov_b32 s2, -1 139; SI-NEXT: s_waitcnt lgkmcnt(0) 140; SI-NEXT: s_and_b32 s4, s6, 0x7fff 141; SI-NEXT: v_mov_b32_e32 v0, s4 142; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 143; SI-NEXT: s_endpgm 144; 145; VI-LABEL: s_test_copysign_f16_1: 146; VI: ; %bb.0: 147; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 148; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 149; VI-NEXT: s_waitcnt lgkmcnt(0) 150; VI-NEXT: s_and_b32 s2, s2, 0x7fff 151; VI-NEXT: v_mov_b32_e32 v0, s0 152; VI-NEXT: v_mov_b32_e32 v1, s1 153; VI-NEXT: v_mov_b32_e32 v2, s2 154; VI-NEXT: flat_store_short v[0:1], v2 155; VI-NEXT: s_endpgm 156; 157; GFX9-LABEL: s_test_copysign_f16_1: 158; GFX9: ; %bb.0: 159; 
GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 160; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 161; GFX9-NEXT: v_mov_b32_e32 v0, 0 162; GFX9-NEXT: s_waitcnt lgkmcnt(0) 163; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff 164; GFX9-NEXT: v_mov_b32_e32 v1, s2 165; GFX9-NEXT: global_store_short v0, v1, s[0:1] 166; GFX9-NEXT: s_endpgm 167; 168; GFX11-LABEL: s_test_copysign_f16_1: 169; GFX11: ; %bb.0: 170; GFX11-NEXT: s_clause 0x1 171; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 172; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 173; GFX11-NEXT: s_waitcnt lgkmcnt(0) 174; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff 175; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 176; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 177; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 178; GFX11-NEXT: s_endpgm 179 %result = call half @llvm.copysign.f16(half %mag, half 1.0) 180 store half %result, ptr addrspace(1) %out, align 4 181 ret void 182} 183 184define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half %mag) { 185; SI-LABEL: s_test_copysign_f16_10.0: 186; SI: ; %bb.0: 187; SI-NEXT: s_load_dword s6, s[4:5], 0xb 188; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 189; SI-NEXT: s_mov_b32 s3, 0xf000 190; SI-NEXT: s_mov_b32 s2, -1 191; SI-NEXT: s_waitcnt lgkmcnt(0) 192; SI-NEXT: s_and_b32 s4, s6, 0x7fff 193; SI-NEXT: v_mov_b32_e32 v0, s4 194; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 195; SI-NEXT: s_endpgm 196; 197; VI-LABEL: s_test_copysign_f16_10.0: 198; VI: ; %bb.0: 199; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 200; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 201; VI-NEXT: s_waitcnt lgkmcnt(0) 202; VI-NEXT: s_and_b32 s2, s2, 0x7fff 203; VI-NEXT: v_mov_b32_e32 v0, s0 204; VI-NEXT: v_mov_b32_e32 v1, s1 205; VI-NEXT: v_mov_b32_e32 v2, s2 206; VI-NEXT: flat_store_short v[0:1], v2 207; VI-NEXT: s_endpgm 208; 209; GFX9-LABEL: s_test_copysign_f16_10.0: 210; GFX9: ; %bb.0: 211; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 212; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 213; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 215; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff 216; GFX9-NEXT: v_mov_b32_e32 v1, s2 217; GFX9-NEXT: global_store_short v0, v1, s[0:1] 218; GFX9-NEXT: s_endpgm 219; 220; GFX11-LABEL: s_test_copysign_f16_10.0: 221; GFX11: ; %bb.0: 222; GFX11-NEXT: s_clause 0x1 223; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 224; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 225; GFX11-NEXT: s_waitcnt lgkmcnt(0) 226; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff 227; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 228; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 229; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 230; GFX11-NEXT: s_endpgm 231 %result = call half @llvm.copysign.f16(half %mag, half 10.0) 232 store half %result, ptr addrspace(1) %out, align 4 233 ret void 234} 235 236define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half %mag) { 237; SI-LABEL: s_test_copysign_f16_neg1: 238; SI: ; %bb.0: 239; SI-NEXT: s_load_dword s6, s[4:5], 0xb 240; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 241; SI-NEXT: s_mov_b32 s3, 0xf000 242; SI-NEXT: s_mov_b32 s2, -1 243; SI-NEXT: s_waitcnt lgkmcnt(0) 244; SI-NEXT: s_or_b32 s4, s6, 0x8000 245; SI-NEXT: v_mov_b32_e32 v0, s4 246; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 247; SI-NEXT: s_endpgm 248; 249; VI-LABEL: s_test_copysign_f16_neg1: 250; VI: ; %bb.0: 251; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 252; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 253; VI-NEXT: s_waitcnt lgkmcnt(0) 254; VI-NEXT: s_bitset1_b32 s2, 15 255; VI-NEXT: v_mov_b32_e32 v0, s0 256; VI-NEXT: v_mov_b32_e32 v1, s1 257; VI-NEXT: v_mov_b32_e32 v2, s2 258; VI-NEXT: flat_store_short v[0:1], v2 259; VI-NEXT: s_endpgm 260; 261; GFX9-LABEL: s_test_copysign_f16_neg1: 262; GFX9: ; %bb.0: 263; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 264; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 265; GFX9-NEXT: v_mov_b32_e32 v0, 0 266; GFX9-NEXT: s_waitcnt lgkmcnt(0) 267; GFX9-NEXT: s_bitset1_b32 s2, 15 268; GFX9-NEXT: 
v_mov_b32_e32 v1, s2 269; GFX9-NEXT: global_store_short v0, v1, s[0:1] 270; GFX9-NEXT: s_endpgm 271; 272; GFX11-LABEL: s_test_copysign_f16_neg1: 273; GFX11: ; %bb.0: 274; GFX11-NEXT: s_clause 0x1 275; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 276; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 277; GFX11-NEXT: s_waitcnt lgkmcnt(0) 278; GFX11-NEXT: s_bitset1_b32 s2, 15 279; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 280; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 281; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 282; GFX11-NEXT: s_endpgm 283 %result = call half @llvm.copysign.f16(half %mag, half -1.0) 284 store half %result, ptr addrspace(1) %out, align 4 285 ret void 286} 287 288define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half %mag) { 289; SI-LABEL: s_test_copysign_f16_neg10: 290; SI: ; %bb.0: 291; SI-NEXT: s_load_dword s6, s[4:5], 0xb 292; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 293; SI-NEXT: s_mov_b32 s3, 0xf000 294; SI-NEXT: s_mov_b32 s2, -1 295; SI-NEXT: s_waitcnt lgkmcnt(0) 296; SI-NEXT: s_or_b32 s4, s6, 0x8000 297; SI-NEXT: v_mov_b32_e32 v0, s4 298; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 299; SI-NEXT: s_endpgm 300; 301; VI-LABEL: s_test_copysign_f16_neg10: 302; VI: ; %bb.0: 303; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 304; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 305; VI-NEXT: s_waitcnt lgkmcnt(0) 306; VI-NEXT: s_bitset1_b32 s2, 15 307; VI-NEXT: v_mov_b32_e32 v0, s0 308; VI-NEXT: v_mov_b32_e32 v1, s1 309; VI-NEXT: v_mov_b32_e32 v2, s2 310; VI-NEXT: flat_store_short v[0:1], v2 311; VI-NEXT: s_endpgm 312; 313; GFX9-LABEL: s_test_copysign_f16_neg10: 314; GFX9: ; %bb.0: 315; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 316; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 317; GFX9-NEXT: v_mov_b32_e32 v0, 0 318; GFX9-NEXT: s_waitcnt lgkmcnt(0) 319; GFX9-NEXT: s_bitset1_b32 s2, 15 320; GFX9-NEXT: v_mov_b32_e32 v1, s2 321; GFX9-NEXT: global_store_short v0, v1, s[0:1] 322; GFX9-NEXT: s_endpgm 323; 324; 
GFX11-LABEL: s_test_copysign_f16_neg10: 325; GFX11: ; %bb.0: 326; GFX11-NEXT: s_clause 0x1 327; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 328; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 329; GFX11-NEXT: s_waitcnt lgkmcnt(0) 330; GFX11-NEXT: s_bitset1_b32 s2, 15 331; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 332; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 333; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 334; GFX11-NEXT: s_endpgm 335 %result = call half @llvm.copysign.f16(half %mag, half -10.0) 336 store half %result, ptr addrspace(1) %out, align 4 337 ret void 338} 339 340define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half %sign) { 341; SI-LABEL: s_test_copysign_f16_0_mag: 342; SI: ; %bb.0: 343; SI-NEXT: s_load_dword s0, s[4:5], 0xb 344; SI-NEXT: s_brev_b32 s2, -2 345; SI-NEXT: s_mov_b32 s3, 0xf000 346; SI-NEXT: s_waitcnt lgkmcnt(0) 347; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 348; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 349; SI-NEXT: v_bfi_b32 v0, s2, 0, v0 350; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 351; SI-NEXT: s_mov_b32 s2, -1 352; SI-NEXT: s_waitcnt lgkmcnt(0) 353; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 354; SI-NEXT: s_endpgm 355; 356; VI-LABEL: s_test_copysign_f16_0_mag: 357; VI: ; %bb.0: 358; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 359; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 360; VI-NEXT: s_waitcnt lgkmcnt(0) 361; VI-NEXT: s_and_b32 s2, s2, 0x8000 362; VI-NEXT: v_mov_b32_e32 v0, s0 363; VI-NEXT: v_mov_b32_e32 v1, s1 364; VI-NEXT: v_mov_b32_e32 v2, s2 365; VI-NEXT: flat_store_short v[0:1], v2 366; VI-NEXT: s_endpgm 367; 368; GFX9-LABEL: s_test_copysign_f16_0_mag: 369; GFX9: ; %bb.0: 370; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 371; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 372; GFX9-NEXT: v_mov_b32_e32 v0, 0 373; GFX9-NEXT: s_waitcnt lgkmcnt(0) 374; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 375; GFX9-NEXT: v_mov_b32_e32 v1, s2 376; GFX9-NEXT: global_store_short v0, v1, s[0:1] 377; GFX9-NEXT: s_endpgm 378; 
379; GFX11-LABEL: s_test_copysign_f16_0_mag: 380; GFX11: ; %bb.0: 381; GFX11-NEXT: s_clause 0x1 382; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 383; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 384; GFX11-NEXT: s_waitcnt lgkmcnt(0) 385; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 386; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 387; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 388; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 389; GFX11-NEXT: s_endpgm 390 %result = call half @llvm.copysign.f16(half 0.0, half %sign) 391 store half %result, ptr addrspace(1) %out, align 4 392 ret void 393} 394 395 396define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half %sign) { 397; SI-LABEL: s_test_copysign_f16_1_mag: 398; SI: ; %bb.0: 399; SI-NEXT: s_load_dword s0, s[4:5], 0xb 400; SI-NEXT: s_brev_b32 s2, -2 401; SI-NEXT: s_mov_b32 s3, 0xf000 402; SI-NEXT: s_waitcnt lgkmcnt(0) 403; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 404; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 405; SI-NEXT: v_bfi_b32 v0, s2, 1.0, v0 406; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 407; SI-NEXT: s_mov_b32 s2, -1 408; SI-NEXT: s_waitcnt lgkmcnt(0) 409; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 410; SI-NEXT: s_endpgm 411; 412; VI-LABEL: s_test_copysign_f16_1_mag: 413; VI: ; %bb.0: 414; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 415; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 416; VI-NEXT: s_waitcnt lgkmcnt(0) 417; VI-NEXT: s_and_b32 s2, s2, 0x8000 418; VI-NEXT: s_or_b32 s2, s2, 0x3c00 419; VI-NEXT: v_mov_b32_e32 v0, s0 420; VI-NEXT: v_mov_b32_e32 v1, s1 421; VI-NEXT: v_mov_b32_e32 v2, s2 422; VI-NEXT: flat_store_short v[0:1], v2 423; VI-NEXT: s_endpgm 424; 425; GFX9-LABEL: s_test_copysign_f16_1_mag: 426; GFX9: ; %bb.0: 427; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 428; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 429; GFX9-NEXT: v_mov_b32_e32 v0, 0 430; GFX9-NEXT: s_waitcnt lgkmcnt(0) 431; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 432; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00 433; GFX9-NEXT: 
v_mov_b32_e32 v1, s2 434; GFX9-NEXT: global_store_short v0, v1, s[0:1] 435; GFX9-NEXT: s_endpgm 436; 437; GFX11-LABEL: s_test_copysign_f16_1_mag: 438; GFX11: ; %bb.0: 439; GFX11-NEXT: s_clause 0x1 440; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 441; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 442; GFX11-NEXT: s_waitcnt lgkmcnt(0) 443; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 444; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 445; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 446; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 447; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 448; GFX11-NEXT: s_endpgm 449 %result = call half @llvm.copysign.f16(half 1.0, half %sign) 450 store half %result, ptr addrspace(1) %out, align 4 451 ret void 452} 453 454define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, half %sign) { 455; SI-LABEL: s_test_copysign_f16_10_mag: 456; SI: ; %bb.0: 457; SI-NEXT: s_load_dword s0, s[4:5], 0xb 458; SI-NEXT: s_brev_b32 s2, -2 459; SI-NEXT: v_mov_b32_e32 v1, 0x41200000 460; SI-NEXT: s_mov_b32 s3, 0xf000 461; SI-NEXT: s_waitcnt lgkmcnt(0) 462; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 463; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 464; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 465; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 466; SI-NEXT: s_mov_b32 s2, -1 467; SI-NEXT: s_waitcnt lgkmcnt(0) 468; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 469; SI-NEXT: s_endpgm 470; 471; VI-LABEL: s_test_copysign_f16_10_mag: 472; VI: ; %bb.0: 473; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 474; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 475; VI-NEXT: s_waitcnt lgkmcnt(0) 476; VI-NEXT: s_and_b32 s2, s2, 0x8000 477; VI-NEXT: s_or_b32 s2, s2, 0x4900 478; VI-NEXT: v_mov_b32_e32 v0, s0 479; VI-NEXT: v_mov_b32_e32 v1, s1 480; VI-NEXT: v_mov_b32_e32 v2, s2 481; VI-NEXT: flat_store_short v[0:1], v2 482; VI-NEXT: s_endpgm 483; 484; GFX9-LABEL: s_test_copysign_f16_10_mag: 485; GFX9: ; %bb.0: 486; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 487; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 488; GFX9-NEXT: v_mov_b32_e32 v0, 0 489; GFX9-NEXT: s_waitcnt lgkmcnt(0) 490; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 491; GFX9-NEXT: s_or_b32 s2, s2, 0x4900 492; GFX9-NEXT: v_mov_b32_e32 v1, s2 493; GFX9-NEXT: global_store_short v0, v1, s[0:1] 494; GFX9-NEXT: s_endpgm 495; 496; GFX11-LABEL: s_test_copysign_f16_10_mag: 497; GFX11: ; %bb.0: 498; GFX11-NEXT: s_clause 0x1 499; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 500; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 501; GFX11-NEXT: s_waitcnt lgkmcnt(0) 502; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 503; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 504; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 505; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 506; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 507; GFX11-NEXT: s_endpgm 508 %result = call half @llvm.copysign.f16(half 10.0, half %sign) 509 store half %result, ptr addrspace(1) %out, align 4 510 ret void 511} 512 513define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, half %sign) { 514; SI-LABEL: s_test_copysign_f16_neg1_mag: 515; SI: ; %bb.0: 516; SI-NEXT: s_load_dword s0, s[4:5], 0xb 517; SI-NEXT: s_brev_b32 s2, -2 518; SI-NEXT: s_mov_b32 s3, 0xf000 519; SI-NEXT: s_waitcnt lgkmcnt(0) 520; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 521; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 522; SI-NEXT: v_bfi_b32 v0, s2, -1.0, v0 523; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 524; SI-NEXT: s_mov_b32 s2, -1 525; SI-NEXT: s_waitcnt lgkmcnt(0) 526; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 527; SI-NEXT: s_endpgm 528; 529; VI-LABEL: s_test_copysign_f16_neg1_mag: 530; VI: ; %bb.0: 531; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 532; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 533; VI-NEXT: s_waitcnt lgkmcnt(0) 534; VI-NEXT: s_and_b32 s2, s2, 0x8000 535; VI-NEXT: s_or_b32 s2, s2, 0x3c00 536; VI-NEXT: v_mov_b32_e32 v0, s0 537; VI-NEXT: v_mov_b32_e32 v1, s1 538; VI-NEXT: v_mov_b32_e32 v2, s2 539; 
VI-NEXT: flat_store_short v[0:1], v2 540; VI-NEXT: s_endpgm 541; 542; GFX9-LABEL: s_test_copysign_f16_neg1_mag: 543; GFX9: ; %bb.0: 544; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 545; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 546; GFX9-NEXT: v_mov_b32_e32 v0, 0 547; GFX9-NEXT: s_waitcnt lgkmcnt(0) 548; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 549; GFX9-NEXT: s_or_b32 s2, s2, 0x3c00 550; GFX9-NEXT: v_mov_b32_e32 v1, s2 551; GFX9-NEXT: global_store_short v0, v1, s[0:1] 552; GFX9-NEXT: s_endpgm 553; 554; GFX11-LABEL: s_test_copysign_f16_neg1_mag: 555; GFX11: ; %bb.0: 556; GFX11-NEXT: s_clause 0x1 557; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 558; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 559; GFX11-NEXT: s_waitcnt lgkmcnt(0) 560; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 561; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 562; GFX11-NEXT: s_or_b32 s2, s2, 0x3c00 563; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 564; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 565; GFX11-NEXT: s_endpgm 566 %result = call half @llvm.copysign.f16(half -1.0, half %sign) 567 store half %result, ptr addrspace(1) %out, align 4 568 ret void 569} 570 571define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out, half %sign) { 572; SI-LABEL: s_test_copysign_f16_neg10_mag: 573; SI: ; %bb.0: 574; SI-NEXT: s_load_dword s0, s[4:5], 0xb 575; SI-NEXT: s_brev_b32 s2, -2 576; SI-NEXT: v_mov_b32_e32 v1, 0xc1200000 577; SI-NEXT: s_mov_b32 s3, 0xf000 578; SI-NEXT: s_waitcnt lgkmcnt(0) 579; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 580; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 581; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 582; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 583; SI-NEXT: s_mov_b32 s2, -1 584; SI-NEXT: s_waitcnt lgkmcnt(0) 585; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 586; SI-NEXT: s_endpgm 587; 588; VI-LABEL: s_test_copysign_f16_neg10_mag: 589; VI: ; %bb.0: 590; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 591; VI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 592; VI-NEXT: s_waitcnt lgkmcnt(0) 593; VI-NEXT: s_and_b32 s2, s2, 0x8000 594; VI-NEXT: s_or_b32 s2, s2, 0x4900 595; VI-NEXT: v_mov_b32_e32 v0, s0 596; VI-NEXT: v_mov_b32_e32 v1, s1 597; VI-NEXT: v_mov_b32_e32 v2, s2 598; VI-NEXT: flat_store_short v[0:1], v2 599; VI-NEXT: s_endpgm 600; 601; GFX9-LABEL: s_test_copysign_f16_neg10_mag: 602; GFX9: ; %bb.0: 603; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 604; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 605; GFX9-NEXT: v_mov_b32_e32 v0, 0 606; GFX9-NEXT: s_waitcnt lgkmcnt(0) 607; GFX9-NEXT: s_and_b32 s2, s2, 0x8000 608; GFX9-NEXT: s_or_b32 s2, s2, 0x4900 609; GFX9-NEXT: v_mov_b32_e32 v1, s2 610; GFX9-NEXT: global_store_short v0, v1, s[0:1] 611; GFX9-NEXT: s_endpgm 612; 613; GFX11-LABEL: s_test_copysign_f16_neg10_mag: 614; GFX11: ; %bb.0: 615; GFX11-NEXT: s_clause 0x1 616; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 617; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 618; GFX11-NEXT: s_waitcnt lgkmcnt(0) 619; GFX11-NEXT: s_and_b32 s2, s2, 0x8000 620; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 621; GFX11-NEXT: s_or_b32 s2, s2, 0x4900 622; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 623; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 624; GFX11-NEXT: s_endpgm 625 %result = call half @llvm.copysign.f16(half -10.0, half %sign) 626 store half %result, ptr addrspace(1) %out, align 4 627 ret void 628} 629 630define half @v_copysign_f16(half %mag, half %sign) { 631; SI-LABEL: v_copysign_f16: 632; SI: ; %bb.0: 633; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 634; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 635; SI-NEXT: s_brev_b32 s4, -2 636; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 637; SI-NEXT: v_bfi_b32 v0, s4, v0, v1 638; SI-NEXT: s_setpc_b64 s[30:31] 639; 640; VI-LABEL: v_copysign_f16: 641; VI: ; %bb.0: 642; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 643; VI-NEXT: s_movk_i32 s4, 0x7fff 644; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 645; VI-NEXT: s_setpc_b64 s[30:31] 646; 647; 
GFX9-LABEL: v_copysign_f16: 648; GFX9: ; %bb.0: 649; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 650; GFX9-NEXT: s_movk_i32 s4, 0x7fff 651; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 652; GFX9-NEXT: s_setpc_b64 s[30:31] 653; 654; GFX11-LABEL: v_copysign_f16: 655; GFX11: ; %bb.0: 656; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 657; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 658; GFX11-NEXT: s_setpc_b64 s[30:31] 659 %result = call half @llvm.copysign.f16(half %mag, half %sign) 660 ret half %result 661} 662 663define half @v_test_copysign_f16_0(half %mag) { 664; SI-LABEL: v_test_copysign_f16_0: 665; SI: ; %bb.0: 666; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 667; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 668; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| 669; SI-NEXT: s_setpc_b64 s[30:31] 670; 671; VI-LABEL: v_test_copysign_f16_0: 672; VI: ; %bb.0: 673; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 674; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 675; VI-NEXT: s_setpc_b64 s[30:31] 676; 677; GFX9-LABEL: v_test_copysign_f16_0: 678; GFX9: ; %bb.0: 679; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 680; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 681; GFX9-NEXT: s_setpc_b64 s[30:31] 682; 683; GFX11-LABEL: v_test_copysign_f16_0: 684; GFX11: ; %bb.0: 685; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 686; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 687; GFX11-NEXT: s_setpc_b64 s[30:31] 688 %result = call half @llvm.copysign.f16(half %mag, half 0.0) 689 ret half %result 690} 691 692define half @v_test_copysign_f16_1(half %mag) { 693; SI-LABEL: v_test_copysign_f16_1: 694; SI: ; %bb.0: 695; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 696; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 697; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| 698; SI-NEXT: s_setpc_b64 s[30:31] 699; 700; VI-LABEL: v_test_copysign_f16_1: 701; VI: ; %bb.0: 702; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 703; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 704; VI-NEXT: s_setpc_b64 s[30:31] 705; 706; GFX9-LABEL: 
v_test_copysign_f16_1: 707; GFX9: ; %bb.0: 708; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 709; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 710; GFX9-NEXT: s_setpc_b64 s[30:31] 711; 712; GFX11-LABEL: v_test_copysign_f16_1: 713; GFX11: ; %bb.0: 714; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 715; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 716; GFX11-NEXT: s_setpc_b64 s[30:31] 717 %result = call half @llvm.copysign.f16(half %mag, half 1.0) 718 ret half %result 719} 720 721define half @v_test_copysign_f16_10(half %mag) { 722; SI-LABEL: v_test_copysign_f16_10: 723; SI: ; %bb.0: 724; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 725; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 726; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0| 727; SI-NEXT: s_setpc_b64 s[30:31] 728; 729; VI-LABEL: v_test_copysign_f16_10: 730; VI: ; %bb.0: 731; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 732; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0 733; VI-NEXT: s_setpc_b64 s[30:31] 734; 735; GFX9-LABEL: v_test_copysign_f16_10: 736; GFX9: ; %bb.0: 737; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 738; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 739; GFX9-NEXT: s_setpc_b64 s[30:31] 740; 741; GFX11-LABEL: v_test_copysign_f16_10: 742; GFX11: ; %bb.0: 743; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 744; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 745; GFX11-NEXT: s_setpc_b64 s[30:31] 746 %result = call half @llvm.copysign.f16(half %mag, half 10.0) 747 ret half %result 748} 749 750define half @v_test_copysign_f16_neg1(half %mag) { 751; SI-LABEL: v_test_copysign_f16_neg1: 752; SI: ; %bb.0: 753; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 754; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 755; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| 756; SI-NEXT: s_setpc_b64 s[30:31] 757; 758; VI-LABEL: v_test_copysign_f16_neg1: 759; VI: ; %bb.0: 760; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 761; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0 762; VI-NEXT: s_setpc_b64 s[30:31] 763; 764; GFX9-LABEL: v_test_copysign_f16_neg1: 765; GFX9: 
; %bb.0: 766; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 767; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 768; GFX9-NEXT: s_setpc_b64 s[30:31] 769; 770; GFX11-LABEL: v_test_copysign_f16_neg1: 771; GFX11: ; %bb.0: 772; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 773; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 774; GFX11-NEXT: s_setpc_b64 s[30:31] 775 %result = call half @llvm.copysign.f16(half %mag, half -1.0) 776 ret half %result 777} 778 779define half @v_test_copysign_f16_neg10(half %mag) { 780; SI-LABEL: v_test_copysign_f16_neg10: 781; SI: ; %bb.0: 782; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 783; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 784; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| 785; SI-NEXT: s_setpc_b64 s[30:31] 786; 787; VI-LABEL: v_test_copysign_f16_neg10: 788; VI: ; %bb.0: 789; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 790; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0 791; VI-NEXT: s_setpc_b64 s[30:31] 792; 793; GFX9-LABEL: v_test_copysign_f16_neg10: 794; GFX9: ; %bb.0: 795; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 796; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0 797; GFX9-NEXT: s_setpc_b64 s[30:31] 798; 799; GFX11-LABEL: v_test_copysign_f16_neg10: 800; GFX11: ; %bb.0: 801; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 802; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 803; GFX11-NEXT: s_setpc_b64 s[30:31] 804 %result = call half @llvm.copysign.f16(half %mag, half -10.0) 805 ret half %result 806} 807 808define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 809; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: 810; SI: ; %bb.0: 811; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 812; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 813; SI-NEXT: s_mov_b32 s11, 0xf000 814; SI-NEXT: s_mov_b32 s14, 0 815; SI-NEXT: s_mov_b32 s15, s11 816; SI-NEXT: s_waitcnt lgkmcnt(0) 817; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 818; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 819; SI-NEXT: 
v_mov_b32_e32 v2, 0 820; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64 821; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 822; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 823; SI-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 824; SI-NEXT: s_mov_b32 s8, s0 825; SI-NEXT: s_brev_b32 s0, -2 826; SI-NEXT: s_mov_b32 s10, -1 827; SI-NEXT: s_mov_b32 s9, s1 828; SI-NEXT: s_waitcnt vmcnt(1) 829; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 830; SI-NEXT: s_waitcnt vmcnt(0) 831; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 832; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 833; SI-NEXT: s_endpgm 834; 835; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32: 836; VI: ; %bb.0: 837; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 838; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 839; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 840; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 841; VI-NEXT: s_waitcnt lgkmcnt(0) 842; VI-NEXT: v_mov_b32_e32 v2, s3 843; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 844; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 845; VI-NEXT: flat_load_ushort v2, v[1:2] 846; VI-NEXT: v_mov_b32_e32 v1, s5 847; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 848; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 849; VI-NEXT: flat_load_dword v3, v[0:1] 850; VI-NEXT: v_mov_b32_e32 v0, s0 851; VI-NEXT: s_brev_b32 s0, -2 852; VI-NEXT: v_mov_b32_e32 v1, s1 853; VI-NEXT: s_waitcnt vmcnt(1) 854; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 855; VI-NEXT: s_waitcnt vmcnt(0) 856; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 857; VI-NEXT: flat_store_dword v[0:1], v2 858; VI-NEXT: s_endpgm 859; 860; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32: 861; GFX9: ; %bb.0: 862; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 863; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 864; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 865; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 866; GFX9-NEXT: v_mov_b32_e32 v2, 0 867; GFX9-NEXT: s_waitcnt lgkmcnt(0) 868; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] 869; GFX9-NEXT: s_brev_b32 s2, -2 870; GFX9-NEXT: global_load_dword v0, v0, s[6:7] 871; 
GFX9-NEXT: s_waitcnt vmcnt(1) 872; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 873; GFX9-NEXT: s_waitcnt vmcnt(0) 874; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 875; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 876; GFX9-NEXT: s_endpgm 877; 878; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32: 879; GFX11: ; %bb.0: 880; GFX11-NEXT: s_clause 0x1 881; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 882; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 883; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 884; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 885; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 886; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 887; GFX11-NEXT: s_waitcnt lgkmcnt(0) 888; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] 889; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] 890; GFX11-NEXT: s_waitcnt vmcnt(1) 891; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 892; GFX11-NEXT: s_waitcnt vmcnt(0) 893; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 894; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0 895; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] 896; GFX11-NEXT: s_endpgm 897 %tid = call i32 @llvm.amdgcn.workitem.id.x() 898 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid 899 %mag = load half, ptr addrspace(1) %arg_mag_gep 900 %mag.ext = fpext half %mag to float 901 %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid 902 %sign = load float, ptr addrspace(1) %arg_sign_gep 903 %out = call float @llvm.copysign.f32(float %mag.ext, float %sign) 904 store float %out, ptr addrspace(1) %arg_out 905 ret void 906} 907 908define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 909; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: 910; SI: ; %bb.0: 911; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 912; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 913; SI-NEXT: s_mov_b32 s11, 0xf000 914; SI-NEXT: s_mov_b32 s14, 0 915; SI-NEXT: s_mov_b32 s15, s11 916; SI-NEXT: s_waitcnt 
lgkmcnt(0) 917; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 918; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 919; SI-NEXT: v_mov_b32_e32 v2, 0 920; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64 921; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 922; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 923; SI-NEXT: buffer_load_dwordx2 v[0:1], v[1:2], s[4:7], 0 addr64 924; SI-NEXT: s_mov_b32 s8, s0 925; SI-NEXT: s_brev_b32 s0, -2 926; SI-NEXT: s_mov_b32 s10, -1 927; SI-NEXT: s_mov_b32 s9, s1 928; SI-NEXT: s_waitcnt vmcnt(0) 929; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 930; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 931; SI-NEXT: v_bfi_b32 v3, s0, v3, v1 932; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 933; SI-NEXT: s_endpgm 934; 935; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64: 936; VI: ; %bb.0: 937; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 938; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 939; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 940; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 941; VI-NEXT: s_waitcnt lgkmcnt(0) 942; VI-NEXT: v_mov_b32_e32 v2, s3 943; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 944; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 945; VI-NEXT: flat_load_ushort v2, v[1:2] 946; VI-NEXT: v_mov_b32_e32 v1, s5 947; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 948; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 949; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 950; VI-NEXT: v_mov_b32_e32 v4, s0 951; VI-NEXT: s_brev_b32 s0, -2 952; VI-NEXT: v_mov_b32_e32 v5, s1 953; VI-NEXT: s_waitcnt vmcnt(0) 954; VI-NEXT: v_cvt_f32_f16_e32 v0, v2 955; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 956; VI-NEXT: v_bfi_b32 v3, s0, v3, v1 957; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3] 958; VI-NEXT: s_endpgm 959; 960; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64: 961; GFX9: ; %bb.0: 962; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 963; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 964; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 965; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 966; GFX9-NEXT: s_waitcnt lgkmcnt(0) 967; GFX9-NEXT: 
global_load_ushort v2, v1, s[2:3] 968; GFX9-NEXT: s_nop 0 969; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] 970; GFX9-NEXT: s_brev_b32 s2, -2 971; GFX9-NEXT: s_waitcnt vmcnt(0) 972; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v2 973; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 974; GFX9-NEXT: v_mov_b32_e32 v0, 0 975; GFX9-NEXT: v_bfi_b32 v3, s2, v3, v1 976; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] 977; GFX9-NEXT: s_endpgm 978; 979; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64: 980; GFX11: ; %bb.0: 981; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 982; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 983; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 984; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 985; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 986; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 987; GFX11-NEXT: s_waitcnt lgkmcnt(0) 988; GFX11-NEXT: global_load_u16 v2, v1, s[2:3] 989; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5] 990; GFX11-NEXT: s_waitcnt vmcnt(0) 991; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2 992; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 993; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 994; GFX11-NEXT: v_mov_b32_e32 v0, 0 995; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1 996; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] 997; GFX11-NEXT: s_endpgm 998 %tid = call i32 @llvm.amdgcn.workitem.id.x() 999 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid 1000 %mag = load half, ptr addrspace(1) %arg_mag_gep 1001 %mag.ext = fpext half %mag to double 1002 %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid 1003 %sign = load double, ptr addrspace(1) %arg_sign_gep 1004 %out = call double @llvm.copysign.f64(double %mag.ext, double %sign) 1005 store double %out, ptr addrspace(1) %arg_out 1006 ret void 1007} 1008; f32 copysign: the magnitude is a workitem-indexed float load, the sign a workitem-indexed half load widened with fpext; the result is stored to %arg_out. 1009define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 1010; SI-LABEL: 
v_copysign_out_f32_mag_f32_sign_f16: 1011; SI: ; %bb.0: 1012; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1013; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1014; SI-NEXT: s_mov_b32 s11, 0xf000 1015; SI-NEXT: s_mov_b32 s14, 0 1016; SI-NEXT: s_mov_b32 s15, s11 1017; SI-NEXT: s_waitcnt lgkmcnt(0) 1018; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 1019; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1020; SI-NEXT: v_mov_b32_e32 v2, 0 1021; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64 1022; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 1023; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1024; SI-NEXT: buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64 1025; SI-NEXT: s_mov_b32 s8, s0 1026; SI-NEXT: s_brev_b32 s0, -2 1027; SI-NEXT: s_mov_b32 s10, -1 1028; SI-NEXT: s_mov_b32 s9, s1 1029; SI-NEXT: s_waitcnt vmcnt(0) 1030; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1031; SI-NEXT: v_bfi_b32 v0, s0, v3, v0 1032; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 1033; SI-NEXT: s_endpgm 1034; 1035; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16: 1036; VI: ; %bb.0: 1037; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1038; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1039; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1040; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1041; VI-NEXT: s_waitcnt lgkmcnt(0) 1042; VI-NEXT: v_mov_b32_e32 v3, s3 1043; VI-NEXT: v_mov_b32_e32 v1, s5 1044; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 1045; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1046; VI-NEXT: flat_load_ushort v4, v[0:1] 1047; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1048; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1049; VI-NEXT: flat_load_dword v2, v[0:1] 1050; VI-NEXT: v_mov_b32_e32 v0, s0 1051; VI-NEXT: s_brev_b32 s0, -2 1052; VI-NEXT: v_mov_b32_e32 v1, s1 1053; VI-NEXT: s_waitcnt vmcnt(1) 1054; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 1055; VI-NEXT: s_waitcnt vmcnt(0) 1056; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1057; VI-NEXT: flat_store_dword v[0:1], v2 1058; VI-NEXT: s_endpgm 1059; 1060; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16: 1061; GFX9: ; 
%bb.0: 1062; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1063; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1064; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1065; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1066; GFX9-NEXT: v_mov_b32_e32 v2, 0 1067; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1068; GFX9-NEXT: global_load_ushort v1, v1, s[6:7] 1069; GFX9-NEXT: s_waitcnt vmcnt(0) 1070; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1071; GFX9-NEXT: global_load_dword v0, v0, s[2:3] 1072; GFX9-NEXT: s_brev_b32 s2, -2 1073; GFX9-NEXT: s_waitcnt vmcnt(0) 1074; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 1075; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 1076; GFX9-NEXT: s_endpgm 1077; 1078; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: 1079; GFX11: ; %bb.0: 1080; GFX11-NEXT: s_clause 0x1 1081; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 1082; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1083; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1084; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1085; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0 1086; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1087; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX11-NEXT: global_load_u16 v1, v1, s[6:7] 1089; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1090; GFX11-NEXT: s_waitcnt vmcnt(1) 1091; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1092; GFX11-NEXT: s_waitcnt vmcnt(0) 1093; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1094; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 1095; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] 1096; GFX11-NEXT: s_endpgm 1097 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1098 %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid 1099 %mag = load float, ptr addrspace(1) %arg_mag_gep 1100 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid 1101 %sign = load half, ptr addrspace(1) %arg_sign_gep 1102 %sign.ext = fpext half %sign to float 1103 %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) 1104 store float %out, ptr addrspace(1) %arg_out 1105 
ret void 1106} 1107; f64 copysign: the magnitude is a workitem-indexed double load, the sign a workitem-indexed half load widened with fpext; the result is stored to %arg_out. 1108define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 1109; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: 1110; SI: ; %bb.0: 1111; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1112; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1113; SI-NEXT: s_mov_b32 s11, 0xf000 1114; SI-NEXT: s_mov_b32 s14, 0 1115; SI-NEXT: s_mov_b32 s15, s11 1116; SI-NEXT: v_mov_b32_e32 v1, 0 1117; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 1118; SI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1119; SI-NEXT: v_mov_b32_e32 v3, v1 1120; SI-NEXT: s_waitcnt lgkmcnt(0) 1121; SI-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 1122; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 1123; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1124; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64 1125; SI-NEXT: s_mov_b32 s8, s0 1126; SI-NEXT: s_brev_b32 s0, -2 1127; SI-NEXT: s_mov_b32 s10, -1 1128; SI-NEXT: s_mov_b32 s9, s1 1129; SI-NEXT: s_waitcnt vmcnt(1) 1130; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1131; SI-NEXT: s_waitcnt vmcnt(0) 1132; SI-NEXT: v_bfi_b32 v1, s0, v1, v2 1133; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1134; SI-NEXT: s_endpgm 1135; 1136; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16: 1137; VI: ; %bb.0: 1138; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1139; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1140; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1141; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1142; VI-NEXT: s_waitcnt lgkmcnt(0) 1143; VI-NEXT: v_mov_b32_e32 v3, s3 1144; VI-NEXT: v_mov_b32_e32 v1, s5 1145; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 1146; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1147; VI-NEXT: flat_load_ushort v4, v[0:1] 1148; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1149; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1150; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1151; VI-NEXT: v_mov_b32_e32 v2, s0 1152; VI-NEXT: s_brev_b32 s0, -2 1153; VI-NEXT: v_mov_b32_e32 v3, s1 1154; VI-NEXT: 
s_waitcnt vmcnt(1) 1155; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 1156; VI-NEXT: s_waitcnt vmcnt(0) 1157; VI-NEXT: v_bfi_b32 v1, s0, v1, v4 1158; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1159; VI-NEXT: s_endpgm 1160; 1161; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16: 1162; GFX9: ; %bb.0: 1163; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1164; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1165; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1166; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1167; GFX9-NEXT: v_mov_b32_e32 v3, 0 1168; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1169; GFX9-NEXT: global_load_ushort v2, v1, s[6:7] 1170; GFX9-NEXT: s_waitcnt vmcnt(0) 1171; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1172; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 1173; GFX9-NEXT: s_brev_b32 s2, -2 1174; GFX9-NEXT: s_waitcnt vmcnt(0) 1175; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 1176; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] 1177; GFX9-NEXT: s_endpgm 1178; 1179; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: 1180; GFX11: ; %bb.0: 1181; GFX11-NEXT: s_clause 0x1 1182; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 1183; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1184; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1185; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1186; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1187; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1188; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1189; GFX11-NEXT: global_load_u16 v2, v1, s[6:7] 1190; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] 1191; GFX11-NEXT: s_waitcnt vmcnt(1) 1192; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1193; GFX11-NEXT: s_waitcnt vmcnt(0) 1194; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1195; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 1196; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] 1197; GFX11-NEXT: s_endpgm 1198 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1199 %arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid 1200 %mag = load double, ptr addrspace(1) 
%arg_mag_gep 1201 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid 1202 %sign = load half, ptr addrspace(1) %arg_sign_gep 1203 %sign.ext = fpext half %sign to double 1204 %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) 1205 store double %out, ptr addrspace(1) %arg_out 1206 ret void 1207} 1208; f16 copysign: the magnitude is a workitem-indexed half load, the sign a workitem-indexed float load narrowed with fptrunc; the result is stored to %arg_out. 1209define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 1210; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: 1211; SI: ; %bb.0: 1212; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1213; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1214; SI-NEXT: s_mov_b32 s11, 0xf000 1215; SI-NEXT: s_mov_b32 s14, 0 1216; SI-NEXT: s_mov_b32 s15, s11 1217; SI-NEXT: s_waitcnt lgkmcnt(0) 1218; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 1219; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1220; SI-NEXT: v_mov_b32_e32 v2, 0 1221; SI-NEXT: buffer_load_ushort v3, v[1:2], s[12:15], 0 addr64 1222; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 1223; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1224; SI-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 1225; SI-NEXT: s_brev_b32 s2, -2 1226; SI-NEXT: s_mov_b32 s10, -1 1227; SI-NEXT: s_mov_b32 s8, s0 1228; SI-NEXT: s_mov_b32 s9, s1 1229; SI-NEXT: s_waitcnt vmcnt(1) 1230; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 1231; SI-NEXT: s_waitcnt vmcnt(0) 1232; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 1233; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1234; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 1235; SI-NEXT: s_endpgm 1236; 1237; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32: 1238; VI: ; %bb.0: 1239; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1240; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1241; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1242; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1243; VI-NEXT: s_waitcnt lgkmcnt(0) 1244; VI-NEXT: v_mov_b32_e32 v3, s3 1245; VI-NEXT: v_mov_b32_e32 v1, s5 1246; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 1247; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
1248; VI-NEXT: flat_load_dword v4, v[0:1] 1249; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1250; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1251; VI-NEXT: flat_load_ushort v2, v[0:1] 1252; VI-NEXT: v_mov_b32_e32 v0, s0 1253; VI-NEXT: s_movk_i32 s0, 0x7fff 1254; VI-NEXT: v_mov_b32_e32 v1, s1 1255; VI-NEXT: s_waitcnt vmcnt(1) 1256; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 1257; VI-NEXT: s_waitcnt vmcnt(0) 1258; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1259; VI-NEXT: flat_store_short v[0:1], v2 1260; VI-NEXT: s_endpgm 1261; 1262; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32: 1263; GFX9: ; %bb.0: 1264; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1265; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1266; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1267; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1268; GFX9-NEXT: v_mov_b32_e32 v2, 0 1269; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1270; GFX9-NEXT: global_load_dword v1, v1, s[6:7] 1271; GFX9-NEXT: s_waitcnt vmcnt(0) 1272; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1273; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] 1274; GFX9-NEXT: s_movk_i32 s2, 0x7fff 1275; GFX9-NEXT: s_waitcnt vmcnt(0) 1276; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1 1277; GFX9-NEXT: global_store_short v2, v0, s[0:1] 1278; GFX9-NEXT: s_endpgm 1279; 1280; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: 1281; GFX11: ; %bb.0: 1282; GFX11-NEXT: s_clause 0x1 1283; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 1284; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1285; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1286; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1287; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 1288; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1289; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1290; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] 1291; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] 1292; GFX11-NEXT: s_waitcnt vmcnt(1) 1293; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1294; GFX11-NEXT: s_waitcnt vmcnt(0) 1295; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1296; GFX11-NEXT: 
v_bfi_b32 v0, 0x7fff, v0, v1 1297; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] 1298; GFX11-NEXT: s_endpgm 1299 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1300 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid 1301 %mag = load half, ptr addrspace(1) %arg_mag_gep 1302 %arg_sign_gep = getelementptr float, ptr addrspace(1) %arg_sign, i32 %tid 1303 %sign = load float, ptr addrspace(1) %arg_sign_gep 1304 %sign.trunc = fptrunc float %sign to half 1305 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) 1306 store half %out, ptr addrspace(1) %arg_out 1307 ret void 1308} 1309; f16 copysign: the sign is a workitem-indexed double load narrowed with fptrunc; the half magnitude is loaded from %arg_mag itself, without the workitem index. 1310define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 1311; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: 1312; SI: ; %bb.0: 1313; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1314; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1315; SI-NEXT: s_mov_b32 s7, 0xf000 1316; SI-NEXT: s_mov_b32 s6, -1 1317; SI-NEXT: s_mov_b32 s14, s6 1318; SI-NEXT: s_waitcnt lgkmcnt(0) 1319; SI-NEXT: s_mov_b32 s12, s2 1320; SI-NEXT: s_mov_b32 s13, s3 1321; SI-NEXT: s_mov_b32 s15, s7 1322; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0 1323; SI-NEXT: s_mov_b32 s10, 0 1324; SI-NEXT: s_mov_b32 s11, s7 1325; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1326; SI-NEXT: v_mov_b32_e32 v1, 0 1327; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 1328; SI-NEXT: s_brev_b32 s2, -2 1329; SI-NEXT: s_mov_b32 s4, s0 1330; SI-NEXT: s_mov_b32 s5, s1 1331; SI-NEXT: s_waitcnt vmcnt(0) 1332; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 1333; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 1334; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1335; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1336; SI-NEXT: s_endpgm 1337; 1338; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64: 1339; VI: ; %bb.0: 1340; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1341; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1342; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 1343; 
VI-NEXT: s_waitcnt lgkmcnt(0) 1344; VI-NEXT: v_mov_b32_e32 v0, s2 1345; VI-NEXT: v_mov_b32_e32 v2, s5 1346; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1347; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1348; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] 1349; VI-NEXT: s_waitcnt vmcnt(0) 1350; VI-NEXT: v_mov_b32_e32 v1, s3 1351; VI-NEXT: flat_load_ushort v3, v[0:1] 1352; VI-NEXT: v_mov_b32_e32 v0, s0 1353; VI-NEXT: s_movk_i32 s0, 0x7fff 1354; VI-NEXT: v_mov_b32_e32 v1, s1 1355; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1356; VI-NEXT: s_waitcnt vmcnt(0) 1357; VI-NEXT: v_bfi_b32 v2, s0, v3, v2 1358; VI-NEXT: flat_store_short v[0:1], v2 1359; VI-NEXT: s_endpgm 1360; 1361; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64: 1362; GFX9: ; %bb.0: 1363; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1364; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1365; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1366; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1367; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] 1368; GFX9-NEXT: s_waitcnt vmcnt(0) 1369; GFX9-NEXT: v_mov_b32_e32 v0, 0 1370; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] 1371; GFX9-NEXT: s_movk_i32 s2, 0x7fff 1372; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1373; GFX9-NEXT: s_waitcnt vmcnt(0) 1374; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 1375; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1376; GFX9-NEXT: s_endpgm 1377; 1378; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: 1379; GFX11: ; %bb.0: 1380; GFX11-NEXT: s_clause 0x1 1381; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 1382; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1383; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1384; GFX11-NEXT: v_mov_b32_e32 v2, 0 1385; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1386; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1387; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1388; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] 1389; GFX11-NEXT: global_load_u16 v0, v2, s[2:3] 1390; GFX11-NEXT: s_waitcnt vmcnt(1) 1391; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1392; GFX11-NEXT: s_waitcnt 
vmcnt(0) 1393; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1394; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 1395; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] 1396; GFX11-NEXT: s_endpgm 1397 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1398 %arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid 1399 %mag = load half, ptr addrspace(1) %arg_mag ; NOTE(review): this reads %arg_mag, not %arg_mag_gep, so the GEP above is dead; the sibling kernels load through their GEP. Confirm the intent - switching to the GEP means regenerating the assertions with utils/update_llc_test_checks.py. 1400 %arg_sign_gep = getelementptr double, ptr addrspace(1) %arg_sign, i32 %tid 1401 %sign = load double, ptr addrspace(1) %arg_sign_gep 1402 %sign.trunc = fptrunc double %sign to half 1403 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) 1404 store half %out, ptr addrspace(1) %arg_out 1405 ret void 1406} 1407; f16 copysign: the magnitude is a workitem-indexed float load narrowed with fptrunc, the sign a workitem-indexed half load; the result is stored to %arg_out. 1408define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1) %arg_out, ptr addrspace(1) %arg_mag, ptr addrspace(1) %arg_sign) { 1409; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16: 1410; SI: ; %bb.0: 1411; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1412; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1413; SI-NEXT: s_mov_b32 s11, 0xf000 1414; SI-NEXT: s_mov_b32 s14, 0 1415; SI-NEXT: s_mov_b32 s15, s11 1416; SI-NEXT: s_waitcnt lgkmcnt(0) 1417; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 1418; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1419; SI-NEXT: v_mov_b32_e32 v2, 0 1420; SI-NEXT: buffer_load_dword v3, v[1:2], s[12:15], 0 addr64 1421; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 1422; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 1423; SI-NEXT: buffer_load_ushort v0, v[1:2], s[4:7], 0 addr64 1424; SI-NEXT: s_brev_b32 s2, -2 1425; SI-NEXT: s_mov_b32 s10, -1 1426; SI-NEXT: s_mov_b32 s8, s0 1427; SI-NEXT: s_mov_b32 s9, s1 1428; SI-NEXT: s_waitcnt vmcnt(1) 1429; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 1430; SI-NEXT: s_waitcnt vmcnt(0) 1431; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1432; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1433; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 1434; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1435; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 1436; SI-NEXT: s_endpgm 1437; 1438; VI-LABEL: 
v_copysign_out_f16_mag_f32_sign_f16: 1439; VI: ; %bb.0: 1440; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1441; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1442; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1443; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1444; VI-NEXT: s_waitcnt lgkmcnt(0) 1445; VI-NEXT: v_mov_b32_e32 v2, s3 1446; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1447; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1448; VI-NEXT: flat_load_dword v2, v[1:2] 1449; VI-NEXT: v_mov_b32_e32 v1, s5 1450; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 1451; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1452; VI-NEXT: flat_load_ushort v3, v[0:1] 1453; VI-NEXT: v_mov_b32_e32 v0, s0 1454; VI-NEXT: s_movk_i32 s0, 0x7fff 1455; VI-NEXT: v_mov_b32_e32 v1, s1 1456; VI-NEXT: s_waitcnt vmcnt(1) 1457; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 1458; VI-NEXT: s_waitcnt vmcnt(0) 1459; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1460; VI-NEXT: flat_store_short v[0:1], v2 1461; VI-NEXT: s_endpgm 1462; 1463; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16: 1464; GFX9: ; %bb.0: 1465; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1466; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1467; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1468; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1469; GFX9-NEXT: v_mov_b32_e32 v2, 0 1470; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1471; GFX9-NEXT: global_load_dword v1, v1, s[2:3] 1472; GFX9-NEXT: s_movk_i32 s2, 0x7fff 1473; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] 1474; GFX9-NEXT: s_waitcnt vmcnt(1) 1475; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 1476; GFX9-NEXT: s_waitcnt vmcnt(0) 1477; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0 1478; GFX9-NEXT: global_store_short v2, v0, s[0:1] 1479; GFX9-NEXT: s_endpgm 1480; 1481; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16: 1482; GFX11: ; %bb.0: 1483; GFX11-NEXT: s_clause 0x1 1484; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1485; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1486; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1487; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1488; 
GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0 1489; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1490; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1491; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] 1492; GFX11-NEXT: global_load_u16 v0, v0, s[4:5] 1493; GFX11-NEXT: s_waitcnt vmcnt(1) 1494; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 1495; GFX11-NEXT: s_waitcnt vmcnt(0) 1496; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1497; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 1498; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] 1499; GFX11-NEXT: s_endpgm 1500 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1501 %arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid 1502 %mag = load float, ptr addrspace(1) %arg_mag_gep 1503 %mag.trunc = fptrunc float %mag to half 1504 %arg_sign_gep = getelementptr half, ptr addrspace(1) %arg_sign, i32 %tid 1505 %sign = load half, ptr addrspace(1) %arg_sign_gep 1506 %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign) 1507 store half %out, ptr addrspace(1) %arg_out 1508 ret void 1509} 1510; f16 copysign on kernel arguments: the double %mag argument is narrowed with fptrunc and combined with the half %sign argument; the result is stored to %arg_out. 1511define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) %arg_out, double %mag, half %sign) { 1512; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: 1513; SI: ; %bb.0: 1514; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1515; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1516; SI-NEXT: s_waitcnt lgkmcnt(0) 1517; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 1518; SI-NEXT: s_lshr_b32 s4, s3, 8 1519; SI-NEXT: s_and_b32 s5, s3, 0x1ff 1520; SI-NEXT: s_and_b32 s6, s4, 0xffe 1521; SI-NEXT: s_or_b32 s2, s5, s2 1522; SI-NEXT: s_cmp_lg_u32 s2, 0 1523; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 1524; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 1525; SI-NEXT: v_readfirstlane_b32 s2, v1 1526; SI-NEXT: s_bfe_u32 s5, s3, 0xb0014 1527; SI-NEXT: s_or_b32 s2, s6, s2 1528; SI-NEXT: s_sub_i32 s6, 0x3f1, s5 1529; SI-NEXT: v_med3_i32 v1, s6, 0, 13 1530; SI-NEXT: s_or_b32 s4, s2, 0x1000 1531; SI-NEXT: v_readfirstlane_b32 s6, v1 1532; SI-NEXT: s_lshr_b32 s6, s4, s6 
1533; SI-NEXT: v_lshl_b32_e32 v1, s6, v1 1534; SI-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 1535; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 1536; SI-NEXT: s_add_i32 s8, s5, 0xfffffc10 1537; SI-NEXT: v_readfirstlane_b32 s4, v1 1538; SI-NEXT: s_lshl_b32 s5, s8, 12 1539; SI-NEXT: s_or_b32 s4, s6, s4 1540; SI-NEXT: s_or_b32 s5, s2, s5 1541; SI-NEXT: s_cmp_lt_i32 s8, 1 1542; SI-NEXT: s_cselect_b32 s9, s4, s5 1543; SI-NEXT: s_and_b32 s6, s9, 7 1544; SI-NEXT: s_cmp_gt_i32 s6, 5 1545; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 1546; SI-NEXT: s_cmp_eq_u32 s6, 3 1547; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 1548; SI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] 1549; SI-NEXT: s_lshr_b32 s6, s9, 2 1550; SI-NEXT: s_or_b32 s4, s4, s5 1551; SI-NEXT: s_cmp_lg_u32 s4, 0 1552; SI-NEXT: s_addc_u32 s4, s6, 0 1553; SI-NEXT: s_cmp_lt_i32 s8, 31 1554; SI-NEXT: s_cselect_b32 s6, s4, 0x7c00 1555; SI-NEXT: s_cmp_lg_u32 s2, 0 1556; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 1557; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 1558; SI-NEXT: v_lshlrev_b32_e32 v1, 9, v1 1559; SI-NEXT: s_cmpk_eq_i32 s8, 0x40f 1560; SI-NEXT: v_or_b32_e32 v1, 0x7c00, v1 1561; SI-NEXT: v_mov_b32_e32 v2, s6 1562; SI-NEXT: s_cselect_b64 vcc, -1, 0 1563; SI-NEXT: s_lshr_b32 s2, s3, 16 1564; SI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 1565; SI-NEXT: s_and_b32 s2, s2, 0x8000 1566; SI-NEXT: v_or_b32_e32 v1, s2, v1 1567; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1568; SI-NEXT: s_brev_b32 s2, -2 1569; SI-NEXT: s_mov_b32 s3, 0xf000 1570; SI-NEXT: v_bfi_b32 v0, s2, v1, v0 1571; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1572; SI-NEXT: s_mov_b32 s2, -1 1573; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1574; SI-NEXT: s_endpgm 1575; 1576; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16: 1577; VI: ; %bb.0: 1578; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1579; VI-NEXT: s_load_dword s4, s[4:5], 0x34 1580; VI-NEXT: s_waitcnt lgkmcnt(0) 1581; VI-NEXT: v_mov_b32_e32 v0, s0 1582; VI-NEXT: v_mov_b32_e32 v1, s1 1583; VI-NEXT: s_lshr_b32 s0, s3, 8 1584; VI-NEXT: s_and_b32 s1, s3, 
0x1ff 1585; VI-NEXT: s_and_b32 s5, s0, 0xffe 1586; VI-NEXT: s_or_b32 s0, s1, s2 1587; VI-NEXT: s_cmp_lg_u32 s0, 0 1588; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 1589; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 1590; VI-NEXT: s_bfe_u32 s1, s3, 0xb0014 1591; VI-NEXT: v_readfirstlane_b32 s0, v2 1592; VI-NEXT: s_sub_i32 s2, 0x3f1, s1 1593; VI-NEXT: s_or_b32 s5, s5, s0 1594; VI-NEXT: v_med3_i32 v2, s2, 0, 13 1595; VI-NEXT: s_or_b32 s0, s5, 0x1000 1596; VI-NEXT: v_readfirstlane_b32 s2, v2 1597; VI-NEXT: s_lshr_b32 s2, s0, s2 1598; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 1599; VI-NEXT: v_cmp_ne_u32_e32 vcc, s0, v2 1600; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 1601; VI-NEXT: s_add_i32 s6, s1, 0xfffffc10 1602; VI-NEXT: v_readfirstlane_b32 s0, v2 1603; VI-NEXT: s_lshl_b32 s1, s6, 12 1604; VI-NEXT: s_or_b32 s0, s2, s0 1605; VI-NEXT: s_or_b32 s1, s5, s1 1606; VI-NEXT: s_cmp_lt_i32 s6, 1 1607; VI-NEXT: s_cselect_b32 s7, s0, s1 1608; VI-NEXT: s_and_b32 s2, s7, 7 1609; VI-NEXT: s_cmp_gt_i32 s2, 5 1610; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 1611; VI-NEXT: s_cmp_eq_u32 s2, 3 1612; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 1613; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] 1614; VI-NEXT: s_lshr_b32 s2, s7, 2 1615; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 1616; VI-NEXT: s_addc_u32 s0, s2, 0 1617; VI-NEXT: s_cmp_lt_i32 s6, 31 1618; VI-NEXT: s_cselect_b32 s2, s0, 0x7c00 1619; VI-NEXT: s_cmp_lg_u32 s5, 0 1620; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 1621; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 1622; VI-NEXT: v_lshlrev_b32_e32 v2, 9, v2 1623; VI-NEXT: s_cmpk_eq_i32 s6, 0x40f 1624; VI-NEXT: v_or_b32_e32 v2, 0x7c00, v2 1625; VI-NEXT: v_mov_b32_e32 v3, s2 1626; VI-NEXT: s_cselect_b64 vcc, -1, 0 1627; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1628; VI-NEXT: s_movk_i32 s0, 0x7fff 1629; VI-NEXT: v_mov_b32_e32 v3, s4 1630; VI-NEXT: v_bfi_b32 v2, s0, v2, v3 1631; VI-NEXT: flat_store_short v[0:1], v2 1632; VI-NEXT: s_endpgm 1633; 1634; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16: 1635; GFX9: ; %bb.0: 1636; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1637; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 1638; GFX9-NEXT: v_mov_b32_e32 v0, 0 1639; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1640; GFX9-NEXT: s_lshr_b32 s4, s3, 8 1641; GFX9-NEXT: s_and_b32 s5, s3, 0x1ff 1642; GFX9-NEXT: s_and_b32 s7, s4, 0xffe 1643; GFX9-NEXT: s_or_b32 s2, s5, s2 1644; GFX9-NEXT: s_cmp_lg_u32 s2, 0 1645; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 1646; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] 1647; GFX9-NEXT: s_bfe_u32 s3, s3, 0xb0014 1648; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1649; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3 1650; GFX9-NEXT: s_or_b32 s7, s7, s2 1651; GFX9-NEXT: v_med3_i32 v1, s4, 0, 13 1652; GFX9-NEXT: s_or_b32 s2, s7, 0x1000 1653; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1654; GFX9-NEXT: s_lshr_b32 s4, s2, s4 1655; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s4 1656; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s2, v1 1657; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 1658; GFX9-NEXT: s_add_i32 s8, s3, 0xfffffc10 1659; GFX9-NEXT: v_readfirstlane_b32 s2, v1 1660; GFX9-NEXT: s_lshl_b32 s3, s8, 12 1661; GFX9-NEXT: s_or_b32 s2, s4, s2 1662; GFX9-NEXT: s_or_b32 s3, s7, s3 1663; GFX9-NEXT: s_cmp_lt_i32 s8, 1 1664; GFX9-NEXT: s_cselect_b32 s9, s2, s3 1665; GFX9-NEXT: s_and_b32 s4, s9, 7 1666; GFX9-NEXT: s_cmp_gt_i32 s4, 5 1667; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 1668; GFX9-NEXT: s_cmp_eq_u32 s4, 3 1669; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 1670; GFX9-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] 1671; GFX9-NEXT: s_lshr_b32 s4, s9, 2 1672; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 1673; GFX9-NEXT: s_addc_u32 s2, s4, 0 1674; GFX9-NEXT: s_cmp_lt_i32 s8, 31 1675; GFX9-NEXT: s_cselect_b32 s4, s2, 0x7c00 1676; GFX9-NEXT: s_cmp_lg_u32 s7, 0 1677; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 1678; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] 1679; GFX9-NEXT: v_lshlrev_b32_e32 v1, 9, v1 1680; GFX9-NEXT: s_cmpk_eq_i32 s8, 0x40f 1681; GFX9-NEXT: v_or_b32_e32 v1, 0x7c00, v1 1682; GFX9-NEXT: v_mov_b32_e32 v2, s4 1683; GFX9-NEXT: s_cselect_b64 vcc, 
-1, 0 1684; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 1685; GFX9-NEXT: s_movk_i32 s2, 0x7fff 1686; GFX9-NEXT: v_mov_b32_e32 v2, s6 1687; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2 1688; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1689; GFX9-NEXT: s_endpgm 1690; 1691; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: 1692; GFX11: ; %bb.0: 1693; GFX11-NEXT: s_clause 0x1 1694; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1695; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 1696; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1697; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff 1698; GFX11-NEXT: s_lshr_b32 s6, s3, 8 1699; GFX11-NEXT: s_or_b32 s2, s5, s2 1700; GFX11-NEXT: s_and_b32 s5, s6, 0xffe 1701; GFX11-NEXT: s_cmp_lg_u32 s2, 0 1702; GFX11-NEXT: s_cselect_b32 s2, -1, 0 1703; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) 1704; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 1705; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014 1706; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2 1707; GFX11-NEXT: s_addk_i32 s2, 0xfc10 1708; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 1709; GFX11-NEXT: v_readfirstlane_b32 s3, v0 1710; GFX11-NEXT: s_lshl_b32 s7, s2, 12 1711; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1712; GFX11-NEXT: v_readfirstlane_b32 s6, v1 1713; GFX11-NEXT: s_or_b32 s3, s5, s3 1714; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1715; GFX11-NEXT: s_or_b32 s5, s3, 0x1000 1716; GFX11-NEXT: s_or_b32 s7, s3, s7 1717; GFX11-NEXT: s_lshr_b32 s6, s5, s6 1718; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1719; GFX11-NEXT: v_lshlrev_b32_e64 v0, v1, s6 1720; GFX11-NEXT: v_mov_b32_e32 v1, 0 1721; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0 1722; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo 1723; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1724; GFX11-NEXT: v_readfirstlane_b32 s5, v0 1725; GFX11-NEXT: s_or_b32 s5, s6, s5 1726; GFX11-NEXT: 
s_cmp_lt_i32 s2, 1 1727; GFX11-NEXT: s_cselect_b32 s5, s5, s7 1728; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 1729; GFX11-NEXT: s_and_b32 s6, s5, 7 1730; GFX11-NEXT: s_cmp_gt_i32 s6, 5 1731; GFX11-NEXT: s_cselect_b32 s7, -1, 0 1732; GFX11-NEXT: s_cmp_eq_u32 s6, 3 1733; GFX11-NEXT: s_cselect_b32 s6, -1, 0 1734; GFX11-NEXT: s_lshr_b32 s5, s5, 2 1735; GFX11-NEXT: s_or_b32 s6, s6, s7 1736; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1737; GFX11-NEXT: s_cmp_lg_u32 s6, 0 1738; GFX11-NEXT: s_addc_u32 s5, s5, 0 1739; GFX11-NEXT: s_cmp_lt_i32 s2, 31 1740; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00 1741; GFX11-NEXT: s_cmp_lg_u32 s3, 0 1742; GFX11-NEXT: s_cselect_b32 s3, -1, 0 1743; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f 1744; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 1745; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 1746; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1747; GFX11-NEXT: v_lshlrev_b32_e32 v0, 9, v0 1748; GFX11-NEXT: v_or_b32_e32 v0, 0x7c00, v0 1749; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1750; GFX11-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo 1751; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 1752; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] 1753; GFX11-NEXT: s_endpgm 1754 %mag.trunc = fptrunc double %mag to half 1755 %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) 1756 store half %result, ptr addrspace(1) %arg_out 1757 ret void 1758} 1759; Uniform (kernel-argument) <2 x half> copysign. Per the checks below, SI converts each half to f32 and merges with v_bfi_b32 under a mask built by s_brev_b32 -2, while VI/GFX9/GFX11 apply v_bfi_b32 with 0x7fff to each 16-bit lane and repack. The CHECK lines are autogenerated (see the file header); regenerate them with utils/update_llc_test_checks.py rather than editing by hand. 1760define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { 1761; SI-LABEL: s_copysign_v2f16: 1762; SI: ; %bb.0: 1763; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1764; SI-NEXT: s_mov_b32 s7, 0xf000 1765; SI-NEXT: s_mov_b32 s6, -1 1766; SI-NEXT: s_waitcnt lgkmcnt(0) 1767; SI-NEXT: s_lshr_b32 s4, s2, 16 1768; SI-NEXT: s_lshr_b32 s5, s3, 16 1769; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 1770; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 1771; SI-NEXT: 
v_cvt_f32_f16_e32 v2, s2 1772; SI-NEXT: v_cvt_f32_f16_e32 v3, s3 1773; SI-NEXT: s_brev_b32 s2, -2 1774; SI-NEXT: v_bfi_b32 v0, s2, v0, v1 1775; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1776; SI-NEXT: v_bfi_b32 v1, s2, v2, v3 1777; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1778; SI-NEXT: s_mov_b32 s4, s0 1779; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1780; SI-NEXT: s_mov_b32 s5, s1 1781; SI-NEXT: v_or_b32_e32 v0, v1, v0 1782; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1783; SI-NEXT: s_endpgm 1784; 1785; VI-LABEL: s_copysign_v2f16: 1786; VI: ; %bb.0: 1787; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1788; VI-NEXT: s_movk_i32 s4, 0x7fff 1789; VI-NEXT: s_waitcnt lgkmcnt(0) 1790; VI-NEXT: v_mov_b32_e32 v0, s2 1791; VI-NEXT: v_mov_b32_e32 v1, s3 1792; VI-NEXT: s_lshr_b32 s3, s3, 16 1793; VI-NEXT: s_lshr_b32 s2, s2, 16 1794; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 1795; VI-NEXT: v_mov_b32_e32 v1, s2 1796; VI-NEXT: v_mov_b32_e32 v2, s3 1797; VI-NEXT: v_bfi_b32 v1, s4, v1, v2 1798; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1799; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1800; VI-NEXT: v_mov_b32_e32 v0, s0 1801; VI-NEXT: v_mov_b32_e32 v1, s1 1802; VI-NEXT: flat_store_dword v[0:1], v2 1803; VI-NEXT: s_endpgm 1804; 1805; GFX9-LABEL: s_copysign_v2f16: 1806; GFX9: ; %bb.0: 1807; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1808; GFX9-NEXT: s_movk_i32 s4, 0x7fff 1809; GFX9-NEXT: v_mov_b32_e32 v0, 0 1810; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1811; GFX9-NEXT: v_mov_b32_e32 v1, s2 1812; GFX9-NEXT: v_mov_b32_e32 v2, s3 1813; GFX9-NEXT: s_lshr_b32 s3, s3, 16 1814; GFX9-NEXT: s_lshr_b32 s2, s2, 16 1815; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 1816; GFX9-NEXT: v_mov_b32_e32 v2, s2 1817; GFX9-NEXT: v_mov_b32_e32 v3, s3 1818; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 1819; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1820; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 1821; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1822; GFX9-NEXT: s_endpgm 1823; 1824; GFX11-LABEL: 
s_copysign_v2f16: 1825; GFX11: ; %bb.0: 1826; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1827; GFX11-NEXT: v_mov_b32_e32 v2, 0 1828; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1829; GFX11-NEXT: v_mov_b32_e32 v0, s3 1830; GFX11-NEXT: s_lshr_b32 s3, s3, 16 1831; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 1832; GFX11-NEXT: v_mov_b32_e32 v1, s3 1833; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 1834; GFX11-NEXT: s_lshr_b32 s2, s2, 16 1835; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) 1836; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 1837; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1838; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1839; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1840; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] 1841; GFX11-NEXT: s_endpgm 1842 %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) 1843 store <2 x half> %out, ptr addrspace(1) %arg_out 1844 ret void 1845} 1846; Uniform (kernel-argument) <3 x half> copysign. The checks below show elements 0-1 packed into one dword store and element 2 written by a separate 16-bit store at byte offset 4. 1847define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { 1848; SI-LABEL: s_copysign_v3f16: 1849; SI: ; %bb.0: 1850; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 1851; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1852; SI-NEXT: s_mov_b32 s7, 0xf000 1853; SI-NEXT: s_waitcnt lgkmcnt(0) 1854; SI-NEXT: s_lshr_b32 s6, s0, 16 1855; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 1856; SI-NEXT: s_lshr_b32 s0, s2, 16 1857; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 1858; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 1859; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 1860; SI-NEXT: v_cvt_f32_f16_e32 v4, s3 1861; SI-NEXT: v_cvt_f32_f16_e32 v5, s2 1862; SI-NEXT: s_brev_b32 s0, -2 1863; SI-NEXT: v_bfi_b32 v2, s0, v2, v3 1864; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1865; SI-NEXT: v_bfi_b32 v1, s0, v1, v5 1866; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 1867; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1868; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1869; SI-NEXT: s_mov_b32 s6, -1 1870; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 1871; SI-NEXT: v_or_b32_e32 v1, v1, v2 1872; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 1873; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 1874; SI-NEXT: s_endpgm 1875; 1876; VI-LABEL: s_copysign_v3f16: 1877; VI: ; %bb.0: 1878; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 1879; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 1880; VI-NEXT: s_movk_i32 s6, 0x7fff 1881; VI-NEXT: s_waitcnt lgkmcnt(0) 1882; VI-NEXT: v_mov_b32_e32 v0, s0 1883; VI-NEXT: v_mov_b32_e32 v1, s2 1884; VI-NEXT: s_lshr_b32 s2, s2, 16 1885; VI-NEXT: s_lshr_b32 s0, s0, 16 1886; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 1887; VI-NEXT: v_mov_b32_e32 v1, s0 1888; VI-NEXT: v_mov_b32_e32 v2, s2 1889; VI-NEXT: v_bfi_b32 v1, s6, v1, v2 1890; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1891; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1892; VI-NEXT: v_mov_b32_e32 v0, s1 1893; VI-NEXT: v_mov_b32_e32 v1, s3 1894; VI-NEXT: s_add_u32 s0, s4, 4 1895; VI-NEXT: v_bfi_b32 v3, s6, v0, v1 1896; VI-NEXT: s_addc_u32 s1, s5, 0 1897; VI-NEXT: v_mov_b32_e32 v0, s0 1898; VI-NEXT: v_mov_b32_e32 v1, s1 1899; VI-NEXT: flat_store_short v[0:1], v3 1900; VI-NEXT: v_mov_b32_e32 v0, s4 1901; VI-NEXT: v_mov_b32_e32 v1, s5 1902; VI-NEXT: flat_store_dword v[0:1], v2 1903; VI-NEXT: s_endpgm 1904; 1905; GFX9-LABEL: s_copysign_v3f16: 1906; GFX9: ; %bb.0: 1907; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 1908; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 1909; GFX9-NEXT: s_movk_i32 s4, 0x7fff 1910; GFX9-NEXT: v_mov_b32_e32 v0, 0 1911; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1912; GFX9-NEXT: v_mov_b32_e32 v1, s0 1913; GFX9-NEXT: v_mov_b32_e32 v2, s2 1914; GFX9-NEXT: s_lshr_b32 s2, s2, 16 1915; GFX9-NEXT: s_lshr_b32 s0, s0, 16 1916; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 1917; GFX9-NEXT: v_mov_b32_e32 v2, s0 1918; GFX9-NEXT: v_mov_b32_e32 v3, s2 1919; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 1920; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 1921; GFX9-NEXT: 
v_lshl_or_b32 v1, v2, 16, v1 1922; GFX9-NEXT: v_mov_b32_e32 v2, s1 1923; GFX9-NEXT: v_mov_b32_e32 v3, s3 1924; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 1925; GFX9-NEXT: global_store_short v0, v2, s[6:7] offset:4 1926; GFX9-NEXT: global_store_dword v0, v1, s[6:7] 1927; GFX9-NEXT: s_endpgm 1928; 1929; GFX11-LABEL: s_copysign_v3f16: 1930; GFX11: ; %bb.0: 1931; GFX11-NEXT: s_clause 0x1 1932; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c 1933; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 1934; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1935; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 1936; GFX11-NEXT: s_lshr_b32 s2, s2, 16 1937; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) 1938; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2 1939; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 1940; GFX11-NEXT: s_lshr_b32 s0, s0, 16 1941; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) 1942; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 1943; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 1944; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 1945; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1946; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 1947; GFX11-NEXT: s_clause 0x1 1948; GFX11-NEXT: global_store_b16 v3, v2, s[4:5] offset:4 1949; GFX11-NEXT: global_store_b32 v3, v0, s[4:5] 1950; GFX11-NEXT: s_endpgm 1951 %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign) 1952 store <3 x half> %out, ptr addrspace(1) %arg_out 1953 ret void 1954} 1955; Uniform (kernel-argument) <4 x half> copysign. The checks below show all four results packed into two dwords and written with a single 64-bit store. 1956define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { 1957; SI-LABEL: s_copysign_v4f16: 1958; SI: ; %bb.0: 1959; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 1960; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 1961; SI-NEXT: s_mov_b32 s7, 0xf000 1962; SI-NEXT: s_mov_b32 s6, -1 1963; SI-NEXT: s_waitcnt lgkmcnt(0) 1964; SI-NEXT: s_lshr_b32 s8, s0, 16 1965; SI-NEXT: 
v_cvt_f32_f16_e32 v2, s0 1966; SI-NEXT: s_lshr_b32 s0, s2, 16 1967; SI-NEXT: s_lshr_b32 s9, s1, 16 1968; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 1969; SI-NEXT: s_lshr_b32 s0, s3, 16 1970; SI-NEXT: v_cvt_f32_f16_e32 v0, s8 1971; SI-NEXT: v_cvt_f32_f16_e32 v1, s9 1972; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 1973; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 1974; SI-NEXT: v_cvt_f32_f16_e32 v6, s2 1975; SI-NEXT: v_cvt_f32_f16_e32 v7, s3 1976; SI-NEXT: s_brev_b32 s0, -2 1977; SI-NEXT: v_bfi_b32 v1, s0, v1, v5 1978; SI-NEXT: v_bfi_b32 v0, s0, v0, v4 1979; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 1980; SI-NEXT: v_bfi_b32 v3, s0, v3, v7 1981; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1982; SI-NEXT: v_bfi_b32 v2, s0, v2, v6 1983; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 1984; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1985; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1986; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1987; SI-NEXT: v_or_b32_e32 v1, v3, v1 1988; SI-NEXT: v_or_b32_e32 v0, v2, v0 1989; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1990; SI-NEXT: s_endpgm 1991; 1992; VI-LABEL: s_copysign_v4f16: 1993; VI: ; %bb.0: 1994; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 1995; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 1996; VI-NEXT: s_movk_i32 s6, 0x7fff 1997; VI-NEXT: s_waitcnt lgkmcnt(0) 1998; VI-NEXT: v_mov_b32_e32 v0, s1 1999; VI-NEXT: v_mov_b32_e32 v1, s3 2000; VI-NEXT: s_lshr_b32 s3, s3, 16 2001; VI-NEXT: s_lshr_b32 s1, s1, 16 2002; VI-NEXT: v_bfi_b32 v0, s6, v0, v1 2003; VI-NEXT: v_mov_b32_e32 v1, s1 2004; VI-NEXT: v_mov_b32_e32 v2, s3 2005; VI-NEXT: v_bfi_b32 v1, s6, v1, v2 
2016; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2017; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2018; VI-NEXT: v_mov_b32_e32 v2, s4 2019; VI-NEXT: v_mov_b32_e32 v3, s5 2020; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2021; VI-NEXT: s_endpgm 2022; 2023; GFX9-LABEL: s_copysign_v4f16: 2024; GFX9: ; %bb.0: 2025; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 2026; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 2027; GFX9-NEXT: s_movk_i32 s4, 0x7fff 2028; GFX9-NEXT: v_mov_b32_e32 v2, 0 2029; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2030; GFX9-NEXT: v_mov_b32_e32 v0, s1 2031; GFX9-NEXT: v_mov_b32_e32 v1, s3 2032; GFX9-NEXT: s_lshr_b32 s3, s3, 16 2033; GFX9-NEXT: s_lshr_b32 s1, s1, 16 2034; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 2035; GFX9-NEXT: v_mov_b32_e32 v1, s1 2036; GFX9-NEXT: v_mov_b32_e32 v3, s3 2037; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 2038; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2039; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 2040; GFX9-NEXT: v_mov_b32_e32 v0, s0 2041; GFX9-NEXT: v_mov_b32_e32 v3, s2 2042; GFX9-NEXT: s_lshr_b32 s1, s2, 16 2043; GFX9-NEXT: s_lshr_b32 s0, s0, 16 2044; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3 2045; GFX9-NEXT: v_mov_b32_e32 v3, s0 2046; GFX9-NEXT: v_mov_b32_e32 v4, s1 2047; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4 2048; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2049; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 2050; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 2051; GFX9-NEXT: s_endpgm 2052; 2053; GFX11-LABEL: s_copysign_v4f16: 2054; GFX11: ; %bb.0: 2055; GFX11-NEXT: s_clause 0x1 2056; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c 2057; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 2058; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2059; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 2060; GFX11-NEXT: v_mov_b32_e32 v1, s2 2061; GFX11-NEXT: s_lshr_b32 s3, s3, 16 2062; GFX11-NEXT: s_lshr_b32 s2, s2, 16 2063; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2064; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: 
v_dual_mov_b32 v3, s2 2065; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 2066; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 2067; GFX11-NEXT: s_lshr_b32 s6, s1, 16 2068; GFX11-NEXT: s_lshr_b32 s0, s0, 16 2069; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 2070; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 2071; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 2072; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1 2073; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2074; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 2075; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 2076; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5] 2077; GFX11-NEXT: s_endpgm 2078 %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) 2079 store <4 x half> %out, ptr addrspace(1) %arg_out 2080 ret void 2081} 2082; Attribute group #0 is the one referenced by the intrinsic declarations at the top of the file. 2083attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } 2084