; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11

; Code generation tests for the llvm.amdgcn.cvt.pkrtz intrinsic, which is
; declared at the bottom of this file as taking two float operands and
; returning <2 x half>.
;
; Every kernel below is compiled five times: once with no -mcpu, and once each
; for fiji, gfx900, gfx1010 and gfx1100. The emitted assembly is matched by the
; autogenerated assertion comments inside each function. Do not edit those
; assertion lines by hand; rerun the script named in the first line of this
; file after changing any IR.

; Both intrinsic operands are float kernel arguments (%x and %y); the packed
; result is stored to %out.
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x, float %y) #0 {
; SI-LABEL: s_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
  store <2 x half> %result, ptr addrspace(1) %out
  ret void
}

; The same float kernel argument (%x) is passed as both intrinsic operands.
; The assertions expect the convert instruction to name one scalar register
; twice.
define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out, float %x) #0 {
; SI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v0, s6, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
  store <2 x half> %result, ptr addrspace(1) %out
  ret void
}

; Both intrinsic operands are undef. For every configuration the assertions
; expect the kernel body to contain only the program-end instruction, i.e. no
; convert and no store are emitted.
; NOTE(review): this test relies on undef operands; if the project prefers
; poison in new tests, confirm before changing and regenerate the assertions.
define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(ptr addrspace(1) %out) #0 {
; GCN-LABEL: s_cvt_pkrtz_undef_undef:
; GCN: ; %bb.0:
; GCN-NEXT: s_endpgm
;
; GFX10-LABEL: s_cvt_pkrtz_undef_undef:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_cvt_pkrtz_undef_undef:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_endpgm
  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
  store <2 x half> %result, ptr addrspace(1) %out
  ret void
}

; Both intrinsic operands are loaded from memory. The workitem id x,
; sign-extended to i64, indexes %a.ptr, %b.ptr and %out; both loads are
; volatile. The assertions expect both convert sources to be vector registers.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; The first operand is loaded from memory (volatile, indexed by workitem id x)
; and the second operand is the constant 1.0. The assertions expect 1.0 to
; appear directly as the second source of the convert instruction.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, 1.0
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; Mirror of the reg_imm test: the first operand is the constant 1.0 and the
; second operand is loaded from memory. The assertions expect 1.0 to appear
; directly as the first source of the convert instruction.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v2, 1.0, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; The first operand is negated before the call (written as fsub from -0.0).
; The assertions expect the negation to appear as a '-' modifier on the first
; source of the convert instruction, with no separate negate instruction.
; NOTE(review): negation is spelled "fsub float -0.0, %a" rather than with the
; fneg instruction; switching would require regenerating the assertions.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %neg.a = fsub float -0.0, %a
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; The second operand is negated before the call (written as fsub from -0.0).
; The assertions expect the negation to appear as a '-' modifier on the second
; source of the convert instruction.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, -v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; Both operands are negated before the call (each written as fsub from -0.0).
; The assertions expect a '-' modifier on both sources of the convert
; instruction.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -v2, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -v1, -v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %neg.a = fsub float -0.0, %a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; The first operand is the negated absolute value of %a (llvm.fabs.f32 followed
; by fsub from -0.0) and the second operand is negated %b. The assertions
; expect both the negate and absolute-value modifiers on the first source and a
; negate modifier on the second source of the convert instruction.
define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cvt_pkrtz_f16_f32_e64 v2, -|v2|, -v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, -|v1|, -v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds float, ptr addrspace(1) %a.ptr, i64 %tid.ext
  %b.gep = getelementptr inbounds float, ptr addrspace(1) %b.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %out, i64 %tid.ext
  %a = load volatile float, ptr addrspace(1) %a.gep
  %b = load volatile float, ptr addrspace(1) %b.gep
  %fabs.a = call float @llvm.fabs.f32(float %a)
  %neg.fabs.a = fsub float -0.0, %fabs.a
  %neg.b = fsub float -0.0, %b
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
  store <2 x half> %cvt, ptr addrspace(1) %out.gep
  ret void
}

; Intrinsics used by the tests above.
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare float @llvm.fabs.f32(float) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1


; #0 is applied to every test kernel; #1 is applied to the intrinsic
; declarations.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }