; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s

; Codegen test for the llvm.cos intrinsic on half-precision inputs (scalar
; half and <2 x half>) inside amdgpu_kernel functions. The same IR is compiled
; once per -mcpu value above and each output is matched under its own
; FileCheck prefix (tahiti -> GFX6, fiji -> GFX8, gfx900 -> GFX9,
; gfx1010 -> GFX10, gfx1100 -> GFX11, gfx1200 -> GFX12).
; The check lines are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; Scalar case: load one half from %a, apply llvm.cos.f16, store the result
; to %r. What the checks below pin down for each target:
;  - tahiti: the input is converted to f32, multiplied by 0x3e22f983 (the f32
;    bit pattern of ~0.15915494, i.e. 1/(2*pi)), passed through v_fract_f32
;    and v_cos_f32, then converted back to f16.
;  - fiji: v_mul_f16 by 0.15915494, then v_fract_f16, then v_cos_f16.
;  - gfx900 and newer: v_mul_f16 by 0.15915494 feeds v_cos_f16 directly; no
;    fract instruction is expected.
define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: cos_f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT:    v_fract_f32_e32 v0, v0
; GFX6-NEXT:    v_cos_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: cos_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT:    v_fract_f16_e32 v0, v0
; GFX8-NEXT:    v_cos_f16_e32 v2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: cos_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT:    v_cos_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cos_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT:    v_cos_f16_e32 v1, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cos_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cos_f16_e32 v1, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: cos_f16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_cos_f16_e32 v1, v1
; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %a.val = load half, ptr addrspace(1) %a
  %r.val = call half @llvm.cos.f16(half %a.val)
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Vector case: the same load / cos / store pattern on <2 x half>, which the
; checks show being moved as a single 32-bit word. Each half is scaled and
; fed to a cos instruction separately, then the two results are recombined:
;  - tahiti: both halves are widened to f32 (the high half after a 16-bit
;    right shift), processed like the scalar case, narrowed back to f16 and
;    merged with a 16-bit left shift plus v_or_b32.
;  - fiji: SDWA forms read and write the high word; the multiplier for the
;    SDWA mul is materialized in a VGPR as 0x3118 (the f16 bit pattern of the
;    same ~0.159 scale factor). The halves are merged with v_or_b32.
;  - gfx900 / gfx1010: SDWA multiply for the high half, no fract, results
;    combined with v_pack_b32_f16.
;  - gfx1100 / gfx1200: the high half is extracted with v_lshrrev_b32 and the
;    results are combined with v_pack_b32_f16.
define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX6-LABEL: cos_v2f16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, 0x3e22f983, v1
; GFX6-NEXT:    v_fract_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x3e22f983, v0
; GFX6-NEXT:    v_fract_f32_e32 v0, v0
; GFX6-NEXT:    v_cos_f32_e32 v0, v0
; GFX6-NEXT:    v_cos_f32_e32 v1, v1
; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: cos_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_dword v0, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT:    v_fract_f16_e32 v1, v1
; GFX8-NEXT:    v_fract_f16_e32 v0, v0
; GFX8-NEXT:    v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT:    v_cos_f16_e32 v3, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: cos_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT:    v_cos_f16_e32 v2, v3
; GFX9-NEXT:    v_cos_f16_e32 v1, v1
; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cos_v2f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT:    v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT:    v_cos_f16_e32 v2, v3
; GFX10-NEXT:    v_cos_f16_e32 v1, v1
; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cos_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
; GFX11-NEXT:    v_cos_f16_e32 v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_cos_f16_e32 v2, v2
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: cos_v2f16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
; GFX12-NEXT:    v_cos_f16_e32 v1, v1
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX12-NEXT:    v_cos_f16_e32 v2, v2
; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %a.val = load <2 x half>, ptr addrspace(1) %a
  %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

; Declarations of the two llvm.cos overloads called by the kernels above.
declare half @llvm.cos.f16(half %a)
declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)