; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s
declare i32 @llvm.amdgcn.workitem.id.x()

; A 64-bit multiplication where no arguments were zero extended.
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[0:1]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v9, s[0:1]
; GFX11-NEXT:    global_load_b64 v[2:3], v9, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v5, v7
; GFX11-NEXT:    global_store_b64 v9, v[4:5], s[2:3]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %gep.out = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %mul = mul i64 %a, %b
  store i64 %mul, ptr addrspace(1) %gep.out
  ret void
}

; A 64-bit multiplication where the second argument was zero extended.
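; The zero-extended operand has a known-zero high half, so only two
; v_mad_u64_u32 partial products are expected instead of the three used above.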
define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_zext_src1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    global_load_dword v4, v3, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s2, v0, v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v1, s[2:3]
; GFX11-NEXT:    global_load_b32 v5, v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v0, v5, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i32, ptr addrspace(1) %gep.b
  %b_ext = zext i32 %b to i64
  %mul = mul i64 %a, %b_ext
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; A 64-bit multiplication where the first argument was zero extended.
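; Same as above with the operands swapped; two v_mad_u64_u32 partial products
; are still enough.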
define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_zext_src0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v2, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s2, v4, v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v5, v1, s[2:3]
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_ext = zext i32 %a to i64
  %mul = mul i64 %a_ext, %b
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where both arguments were zero extended.
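; With both operands known to fit in 32 bits, the whole product is produced by
; a single v_mad_u64_u32.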
define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_zext_src0_src1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v1, v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src0_src1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, 0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %gep.a
  %b = load i32, ptr addrspace(1) %gep.b
  %a_ext = zext i32 %a to i64
  %b_ext = zext i32 %b to i64
  %mul = mul i64 %a_ext, %b_ext
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the upper bytes of the first argument were masked.
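; Masking the high 32 bits is equivalent to a zero extend of the low half, so
; the two-instruction v_mad_u64_u32 sequence is expected again.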
define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0_hi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v4, v2, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s2, v4, v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_and = and i64 %a, u0x00000000FFFFFFFF
  %mul = mul i64 %a_and, %b
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the lower bytes of the first argument were masked.
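; Only the high half of the first operand can be nonzero, so the low half of
; the result is known zero and the high half is a single v_mul_lo_u32.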
define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0_lo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0_lo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_and = and i64 %a, u0xFFFFFFFF00000000
  %mul = mul i64 %a_and, %b
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the lower bytes of the second argument were masked.
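; As above, only one cross product survives: the result's low half is zero and
; its high half is a single v_mul_lo_u32.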
define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src1_lo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    ; kill: killed $vgpr3
; GFX10-NEXT:    ; kill: killed $sgpr2_sgpr3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
; GFX10-NEXT:    ; kill: killed $sgpr6_sgpr7
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src1_lo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    global_load_b64 v[1:2], v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v0, v2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %b_and = and i64 %b, u0xFFFFFFFF00000000
  %mul = mul i64 %a, %b_and
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the entire first argument is masked.
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_and = and i64 %a, u0x0000000000000000
  %mul = mul i64 %a_and, %b
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where parts of the high and low bytes of the first argument are masked.
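; Parts of both halves survive the mask, so the full three partial product
; sequence is still required.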
define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_partially_masked_src0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_e32 v6, 0xfff00000, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s2, v6, v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v5
; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
; GFX10-NEXT:    v_and_b32_e32 v0, 0xf00f, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[4:5], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_and_b32_e32 v7, 0xfff00000, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, v7, v2, 0
; GFX11-NEXT:    v_mov_b32_e32 v0, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
; GFX11-NEXT:    v_and_b32_e32 v3, 0xf00f, v1
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[4:5], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_and = and i64 %a, u0x0000F00FFFF00000
  %mul = mul i64 %a_and, %b
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the first argument is masked before a branch.
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_and = and i64 %a, u0x0000000000000000
  %0 = icmp eq i64 %b, 0
  br i1 %0, label %if, label %else

if:
  %b_and = and i64 %b, u0xFFFFFFFF00000000
  %1 = mul i64 %a_and, %b_and
  br label %endif

else:
  %2 = mul i64 %a_and, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication with both arguments masked in different basic blocks.
define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_and_in_branch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v0, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v0, s[6:7]
; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT:    s_xor_b32 s2, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execz .LBB10_2
; GFX10-NEXT:  ; %bb.1: ; %else
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s3, v2, v4, 0
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s3, v2, v5, v[1:2]
; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT:  .LBB10_2: ; %Flow
; GFX10-NEXT:    s_andn2_saveexec_b32 s2, s2
; GFX10-NEXT:    s_cbranch_execz .LBB10_4
; GFX10-NEXT:  ; %bb.3: ; %if
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v5
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:  .LBB10_4: ; %endif
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_and_in_branch:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[2:3], v0, s[2:3]
; GFX11-NEXT:    global_load_b64 v[4:5], v0, s[4:5]
; GFX11-NEXT:    s_mov_b32 s2, exec_lo
; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cmpx_ge_u64_e32 0, v[2:3]
; GFX11-NEXT:    s_xor_b32 s2, exec_lo, s2
; GFX11-NEXT:    s_cbranch_execz .LBB10_2
; GFX11-NEXT:  ; %bb.1: ; %else
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v4, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT:    v_mov_b32_e32 v1, v3
; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT:  .LBB10_2: ; %Flow
; GFX11-NEXT:    s_and_not1_saveexec_b32 s2, s2
; GFX11-NEXT:    s_cbranch_execz .LBB10_4
; GFX11-NEXT:  ; %bb.3: ; %if
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v5
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:  .LBB10_4: ; %endif
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_and = and i64 %a, u0x00000000FFFFFFFF
  %0 = icmp ugt i64 %a, 0
  br i1 %0, label %if, label %else

if:
  %b_and = and i64 %b, u0xFFFFFFFF00000000
  %1 = mul i64 %a_and, %b_and
  br label %endif

else:
  %2 = mul i64 %a_and, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}