; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s

; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFF))
; unless isFabsFree returns true

; fabs of a half that is bitcast from an i16 kernel argument. Every target
; is expected to clear the sign bit with a scalar AND of 0x7fff.
define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI-LABEL: s_fabs_free_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_free_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_free_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fabs_free_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i16 %in to half
  %fabs = call half @llvm.fabs.f16(half %bc)
  store half %fabs, ptr addrspace(1) %out
  ret void
}

; fabs of a scalar half kernel argument; expected to lower to the same
; 0x7fff scalar AND as the bitcast variant above.
define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI-LABEL: s_fabs_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fabs_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fabs = call half @llvm.fabs.f16(half %in)
  store half %fabs, ptr addrspace(1) %out
  ret void
}

; fabs of a <2 x half> kernel argument; both sign bits are expected to be
; cleared by a single 32-bit scalar AND of 0x7fff7fff.
define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; CI-LABEL: s_fabs_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fabs_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
  store <2 x half> %fabs, ptr addrspace(1) %out
  ret void
}

; fabs of a <4 x half> kernel argument; expected to lower to two scalar ANDs
; of 0x7fff7fff, one per 32-bit half of the vector.
define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI-LABEL: s_fabs_v4f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT:    v_mov_b32_e32 v3, s1
; CI-NEXT:    v_mov_b32_e32 v0, s2
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_mov_b32_e32 v2, s0
; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT:    s_endpgm
;
; VI-LABEL: s_fabs_v4f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_fabs_v4f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: s_fabs_v4f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX11-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
  store <4 x half> %fabs, ptr addrspace(1) %out
  ret void
}

; fabs feeding an fmul. The checks expect the fabs to be folded into a |x|
; source modifier (on the f16->f32 convert for kaveri, on v_mul_f16
; elsewhere) rather than emitted as a separate AND.
define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half %in1) {
; CI-LABEL: fabs_fold_f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[8:9], 0x2
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT:    s_lshr_b32 s0, s0, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    v_mul_f32_e32 v0, v0, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    flat_store_short v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fabs_fold_f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_mul_f16_e64 v2, |s2|, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fabs_fold_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    v_mul_f16_e64 v1, |s2|, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fabs_fold_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_mul_f16_e64 v1, |s2|, s3
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %fabs = call half @llvm.fabs.f16(half %in0)
  %fmul = fmul half %fabs, %in1
  store half %fmul, ptr addrspace(1) %out
  ret void
}

; fabs of a <2 x half> loaded per work-item; expected to lower to a vector
; AND of 0x7fff7fff.
; FIXME(review): %gep.out below is computed from %in, not %out, so the result
; is stored back over the input element and %out is never used. The checks
; reflect this (only the second kernel argument is loaded). Switching the
; base pointer to %out requires regenerating the assertions.
define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: v_fabs_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v2, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_fabs_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fabs_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_fabs_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
  %gep.out = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x half>, ptr addrspace(1) %gep.in, align 2
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  store <2 x half> %fabs, ptr addrspace(1) %gep.out
  ret void
}

; fabs of a <2 x half> that is bitcast from an i32 kernel argument; expected
; to lower to a scalar AND of 0x7fff7fff.
define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI-LABEL: fabs_free_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_mov_b32_e32 v2, s2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: fabs_free_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: fabs_free_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: fabs_free_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %bc = bitcast i32 %in to <2 x half>
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
  store <2 x half> %fabs, ptr addrspace(1) %out
  ret void
}

; FIXME: Should do fabs after conversion to avoid converting multiple
; times in this particular case.
define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; CI-LABEL: v_fabs_fold_self_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT:    v_cvt_f32_f16_e32 v3, v0
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_mul_f32_e32 v1, v1, v2
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v0, v0, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_fabs_fold_self_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v3, |v2|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_mul_f16_e64 v2, |v2|, v2
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fabs_fold_self_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v0
; GFX9-NEXT:    v_pk_mul_f16 v0, v2, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_fabs_fold_self_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
; GFX11-NEXT:    v_pk_mul_f16 v0, v1, v0
; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x half>, ptr addrspace(1) %gep
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %fmul = fmul <2 x half> %fabs, %val
  store <2 x half> %fmul, ptr addrspace(1) %out
  ret void
}

; fabs of a loaded <2 x half> multiplied by a second vector that is bitcast
; from an i32 kernel argument. kaveri and tonga are expected to fold the fabs
; into |x| source modifiers; gfx900 and gfx1100 keep an explicit AND in front
; of v_pk_mul_f16.
define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %other.val) #0 {
; CI-LABEL: v_fabs_fold_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT:    s_load_dword s4, s[8:9], 0x4
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s3
; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_lshr_b32 s2, s4, 16
; CI-NEXT:    v_cvt_f32_f16_e32 v1, s2
; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_mul_f32_e32 v1, v2, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
; CI-NEXT:    v_mul_f32_e32 v0, v0, v3
; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    flat_store_dword v[0:1], v2
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_fabs_fold_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v2, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshr_b32 s0, s4, 16
; VI-NEXT:    v_mov_b32_e32 v3, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_sdwa v3, |v2|, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e64 v2, |v2|, s4
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fabs_fold_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x10
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s4
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_fabs_fold_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT:    v_pk_mul_f16 v0, v0, s4
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x half>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x half>, ptr addrspace(1) %gep
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %other.val.cvt = bitcast i32 %other.val to <2 x half>
  %fmul = fmul <2 x half> %fabs, %other.val.cvt
  store <2 x half> %fmul, ptr addrspace(1) %out
  ret void
}

; Both elements of the fabs result are extracted and fed to scalar arithmetic
; (fmul by 4.0, fadd of 2.0). The checks expect the fabs to be folded into
; |x| source modifiers on every target.
define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI-LABEL: v_extract_fabs_fold_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
; CI-NEXT:    v_add_f32_e32 v1, 2.0, v1
; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; CI-NEXT:    flat_store_short v[0:1], v0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_short v[0:1], v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_extract_fabs_fold_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_f16_e64 v2, |v0|, 4.0
; VI-NEXT:    v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_extract_fabs_fold_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e64 v2, |v0|, 4.0
; GFX9-NEXT:    v_add_f16_sdwa v0, |v0|, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    global_store_short v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_short v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_extract_fabs_fold_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_mul_f16_e64 v0, |v0|, 4.0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_add_f16_e64 v1, |v1|, 2.0
; GFX11-NEXT:    global_store_b16 v[0:1], v0, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b16 v[0:1], v1, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x half>, ptr addrspace(1) %gep.in
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %elt0 = extractelement <2 x half> %fabs, i32 0
  %elt1 = extractelement <2 x half> %fabs, i32 1

  %fmul0 = fmul half %elt0, 4.0
  %fadd1 = fadd half %elt1, 2.0
  store volatile half %fmul0, ptr addrspace(1) undef
  store volatile half %fadd1, ptr addrspace(1) undef
  ret void
}

; Both elements of the fabs result are extracted and stored directly, so
; there is no arithmetic user to absorb a source modifier. The checks expect
; the sign bits to be cleared with explicit AND / bitfield-extract
; instructions instead.
define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI-LABEL: v_extract_fabs_no_fold_v2f16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s1
; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT:    flat_load_dword v0, v[0:1]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_bfe_u32 v1, v0, 16, 15
; CI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
; CI-NEXT:    flat_store_short v[0:1], v0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    flat_store_short v[0:1], v1
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; VI-LABEL: v_extract_fabs_no_fold_v2f16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
; VI-NEXT:    v_bfe_u32 v0, v0, 16, 15
; VI-NEXT:    flat_store_short v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_short v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_extract_fabs_no_fold_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT:    global_store_short v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_short_d16_hi v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX11-LABEL: v_extract_fabs_no_fold_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT:    global_store_b16 v[0:1], v0, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_d16_hi_b16 v[0:1], v0, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x half>, ptr addrspace(1) %gep.in
  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
  %elt0 = extractelement <2 x half> %fabs, i32 0
  %elt1 = extractelement <2 x half> %fabs, i32 1
  store volatile half %elt0, ptr addrspace(1) undef
  store volatile half %elt1, ptr addrspace(1) undef
  ret void
}

declare half @llvm.fabs.f16(half) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }