1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s 6; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s 7; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s 8 9; mul24 and mad24 are affected 10 11define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 12; SI-LABEL: test_mul_v2i32: 13; SI: ; %bb.0: ; %entry 14; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 15; SI-NEXT: s_mov_b32 s7, 0xf000 16; SI-NEXT: s_mov_b32 s6, -1 17; SI-NEXT: s_mov_b32 s10, s6 18; SI-NEXT: s_mov_b32 s11, s7 19; SI-NEXT: s_waitcnt lgkmcnt(0) 20; SI-NEXT: s_mov_b32 s8, s2 21; SI-NEXT: s_mov_b32 s9, s3 22; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 23; SI-NEXT: s_mov_b32 s4, s0 24; SI-NEXT: s_mov_b32 s5, s1 25; SI-NEXT: s_waitcnt vmcnt(0) 26; SI-NEXT: v_mul_lo_u32 v1, v1, v3 27; SI-NEXT: v_mul_lo_u32 v0, v0, v2 28; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 29; SI-NEXT: s_endpgm 30; 31; VI-LABEL: test_mul_v2i32: 32; VI: ; %bb.0: ; %entry 33; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 34; VI-NEXT: s_mov_b32 s7, 0xf000 35; VI-NEXT: s_mov_b32 s6, -1 36; VI-NEXT: s_mov_b32 s10, s6 37; VI-NEXT: s_mov_b32 s11, s7 38; VI-NEXT: s_waitcnt 
lgkmcnt(0) 39; VI-NEXT: s_mov_b32 s8, s2 40; VI-NEXT: s_mov_b32 s9, s3 41; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 42; VI-NEXT: s_mov_b32 s4, s0 43; VI-NEXT: s_mov_b32 s5, s1 44; VI-NEXT: s_waitcnt vmcnt(0) 45; VI-NEXT: v_mul_lo_u32 v1, v1, v3 46; VI-NEXT: v_mul_lo_u32 v0, v0, v2 47; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 48; VI-NEXT: s_endpgm 49; 50; GFX9-LABEL: test_mul_v2i32: 51; GFX9: ; %bb.0: ; %entry 52; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 53; GFX9-NEXT: s_mov_b32 s7, 0xf000 54; GFX9-NEXT: s_mov_b32 s6, -1 55; GFX9-NEXT: s_mov_b32 s10, s6 56; GFX9-NEXT: s_mov_b32 s11, s7 57; GFX9-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-NEXT: s_mov_b32 s8, s2 59; GFX9-NEXT: s_mov_b32 s9, s3 60; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 61; GFX9-NEXT: s_mov_b32 s4, s0 62; GFX9-NEXT: s_mov_b32 s5, s1 63; GFX9-NEXT: s_waitcnt vmcnt(0) 64; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 65; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 66; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 67; GFX9-NEXT: s_endpgm 68; 69; GFX10-LABEL: test_mul_v2i32: 70; GFX10: ; %bb.0: ; %entry 71; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 72; GFX10-NEXT: s_mov_b32 s6, -1 73; GFX10-NEXT: s_mov_b32 s7, 0x31016000 74; GFX10-NEXT: s_mov_b32 s10, s6 75; GFX10-NEXT: s_mov_b32 s11, s7 76; GFX10-NEXT: s_waitcnt lgkmcnt(0) 77; GFX10-NEXT: s_mov_b32 s8, s2 78; GFX10-NEXT: s_mov_b32 s9, s3 79; GFX10-NEXT: s_mov_b32 s4, s0 80; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 81; GFX10-NEXT: s_mov_b32 s5, s1 82; GFX10-NEXT: s_waitcnt vmcnt(0) 83; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 84; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 85; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 86; GFX10-NEXT: s_endpgm 87; 88; GFX11-LABEL: test_mul_v2i32: 89; GFX11: ; %bb.0: ; %entry 90; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 91; GFX11-NEXT: s_mov_b32 s6, -1 92; GFX11-NEXT: s_mov_b32 s7, 0x31016000 93; GFX11-NEXT: s_mov_b32 s10, s6 94; GFX11-NEXT: s_mov_b32 s11, s7 95; 
GFX11-NEXT: s_waitcnt lgkmcnt(0) 96; GFX11-NEXT: s_mov_b32 s8, s2 97; GFX11-NEXT: s_mov_b32 s9, s3 98; GFX11-NEXT: s_mov_b32 s4, s0 99; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 100; GFX11-NEXT: s_mov_b32 s5, s1 101; GFX11-NEXT: s_waitcnt vmcnt(0) 102; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3 103; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2 104; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 105; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 106; GFX11-NEXT: s_endpgm 107; 108; EG-LABEL: test_mul_v2i32: 109; EG: ; %bb.0: ; %entry 110; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 111; EG-NEXT: TEX 0 @6 112; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 113; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 114; EG-NEXT: CF_END 115; EG-NEXT: PAD 116; EG-NEXT: Fetch clause starting at 6: 117; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 118; EG-NEXT: ALU clause starting at 8: 119; EG-NEXT: MOV * T0.X, KC0[2].Z, 120; EG-NEXT: ALU clause starting at 9: 121; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.W, 122; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 123; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Z, 124; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 125entry: 126 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 127 %a = load <2 x i32>, ptr addrspace(1) %in 128 %b = load <2 x i32>, ptr addrspace(1) %b_ptr 129 %result = mul <2 x i32> %a, %b 130 store <2 x i32> %result, ptr addrspace(1) %out 131 ret void 132} 133 134define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 135; SI-LABEL: v_mul_v4i32: 136; SI: ; %bb.0: ; %entry 137; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 138; SI-NEXT: s_mov_b32 s7, 0xf000 139; SI-NEXT: s_mov_b32 s6, -1 140; SI-NEXT: s_mov_b32 s10, s6 141; SI-NEXT: s_mov_b32 s11, s7 142; SI-NEXT: s_waitcnt lgkmcnt(0) 143; SI-NEXT: s_mov_b32 s8, s2 144; SI-NEXT: s_mov_b32 s9, s3 145; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 146; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 147; SI-NEXT: s_mov_b32 s4, s0 148; 
SI-NEXT: s_mov_b32 s5, s1 149; SI-NEXT: s_waitcnt vmcnt(0) 150; SI-NEXT: v_mul_lo_u32 v3, v3, v7 151; SI-NEXT: v_mul_lo_u32 v2, v2, v6 152; SI-NEXT: v_mul_lo_u32 v1, v1, v5 153; SI-NEXT: v_mul_lo_u32 v0, v0, v4 154; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 155; SI-NEXT: s_endpgm 156; 157; VI-LABEL: v_mul_v4i32: 158; VI: ; %bb.0: ; %entry 159; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 160; VI-NEXT: s_mov_b32 s7, 0xf000 161; VI-NEXT: s_mov_b32 s6, -1 162; VI-NEXT: s_mov_b32 s10, s6 163; VI-NEXT: s_mov_b32 s11, s7 164; VI-NEXT: s_waitcnt lgkmcnt(0) 165; VI-NEXT: s_mov_b32 s8, s2 166; VI-NEXT: s_mov_b32 s9, s3 167; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 168; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 169; VI-NEXT: s_mov_b32 s4, s0 170; VI-NEXT: s_mov_b32 s5, s1 171; VI-NEXT: s_waitcnt vmcnt(0) 172; VI-NEXT: v_mul_lo_u32 v3, v3, v7 173; VI-NEXT: v_mul_lo_u32 v2, v2, v6 174; VI-NEXT: v_mul_lo_u32 v1, v1, v5 175; VI-NEXT: v_mul_lo_u32 v0, v0, v4 176; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 177; VI-NEXT: s_endpgm 178; 179; GFX9-LABEL: v_mul_v4i32: 180; GFX9: ; %bb.0: ; %entry 181; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 182; GFX9-NEXT: s_mov_b32 s7, 0xf000 183; GFX9-NEXT: s_mov_b32 s6, -1 184; GFX9-NEXT: s_mov_b32 s10, s6 185; GFX9-NEXT: s_mov_b32 s11, s7 186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 187; GFX9-NEXT: s_mov_b32 s8, s2 188; GFX9-NEXT: s_mov_b32 s9, s3 189; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 190; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 191; GFX9-NEXT: s_mov_b32 s4, s0 192; GFX9-NEXT: s_mov_b32 s5, s1 193; GFX9-NEXT: s_waitcnt vmcnt(0) 194; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7 195; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6 196; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5 197; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 198; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 199; GFX9-NEXT: s_endpgm 200; 201; GFX10-LABEL: v_mul_v4i32: 202; GFX10: ; %bb.0: ; %entry 203; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 204; GFX10-NEXT: s_mov_b32 s6, -1 205; GFX10-NEXT: s_mov_b32 s7, 0x31016000 206; GFX10-NEXT: s_mov_b32 s10, s6 207; GFX10-NEXT: s_mov_b32 s11, s7 208; GFX10-NEXT: s_waitcnt lgkmcnt(0) 209; GFX10-NEXT: s_mov_b32 s8, s2 210; GFX10-NEXT: s_mov_b32 s9, s3 211; GFX10-NEXT: s_clause 0x1 212; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 213; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 214; GFX10-NEXT: s_mov_b32 s4, s0 215; GFX10-NEXT: s_mov_b32 s5, s1 216; GFX10-NEXT: s_waitcnt vmcnt(0) 217; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7 218; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6 219; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5 220; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 221; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 222; GFX10-NEXT: s_endpgm 223; 224; GFX11-LABEL: v_mul_v4i32: 225; GFX11: ; %bb.0: ; %entry 226; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 227; GFX11-NEXT: s_mov_b32 s6, -1 228; GFX11-NEXT: s_mov_b32 s7, 0x31016000 229; GFX11-NEXT: s_mov_b32 s10, s6 230; GFX11-NEXT: s_mov_b32 s11, s7 231; GFX11-NEXT: s_waitcnt lgkmcnt(0) 232; GFX11-NEXT: s_mov_b32 s8, s2 233; GFX11-NEXT: s_mov_b32 s9, s3 234; GFX11-NEXT: s_clause 0x1 235; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 236; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16 237; GFX11-NEXT: s_mov_b32 s4, s0 238; GFX11-NEXT: s_mov_b32 s5, s1 239; GFX11-NEXT: s_waitcnt vmcnt(0) 240; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7 241; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6 242; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5 243; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4 244; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 245; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 246; GFX11-NEXT: s_endpgm 247; 248; EG-LABEL: v_mul_v4i32: 249; EG: ; %bb.0: ; %entry 250; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 251; EG-NEXT: TEX 1 @6 252; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 253; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 254; EG-NEXT: CF_END 255; 
EG-NEXT: PAD 256; EG-NEXT: Fetch clause starting at 6: 257; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 258; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 259; EG-NEXT: ALU clause starting at 10: 260; EG-NEXT: MOV * T0.X, KC0[2].Z, 261; EG-NEXT: ALU clause starting at 11: 262; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.W, 263; EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z, 264; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.Y, 265; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 266; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 267; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 268entry: 269 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 270 %a = load <4 x i32>, ptr addrspace(1) %in 271 %b = load <4 x i32>, ptr addrspace(1) %b_ptr 272 %result = mul <4 x i32> %a, %b 273 store <4 x i32> %result, ptr addrspace(1) %out 274 ret void 275} 276; s_trunc_i64_mul_to_i32 multiplies the i64 kernel arguments %a and %b, truncates the product to i32 and stores it to %out; every run below is expected to emit a single 32-bit multiply (s_mul_i32, or MULLO_INT on redwood). 277define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { 278; SI-LABEL: s_trunc_i64_mul_to_i32: 279; SI: ; %bb.0: ; %entry 280; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 281; SI-NEXT: s_waitcnt lgkmcnt(0) 282; SI-NEXT: s_load_dword s7, s[0:1], 0xd 283; SI-NEXT: s_mov_b32 s3, 0xf000 284; SI-NEXT: s_mov_b32 s2, -1 285; SI-NEXT: s_mov_b32 s0, s4 286; SI-NEXT: s_waitcnt lgkmcnt(0) 287; SI-NEXT: s_mul_i32 s4, s7, s6 288; SI-NEXT: s_mov_b32 s1, s5 289; SI-NEXT: v_mov_b32_e32 v0, s4 290; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 291; SI-NEXT: s_endpgm 292; 293; VI-LABEL: s_trunc_i64_mul_to_i32: 294; VI: ; %bb.0: ; %entry 295; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 296; VI-NEXT: s_waitcnt lgkmcnt(0) 297; VI-NEXT: s_load_dword s7, s[0:1], 0x34 298; VI-NEXT: s_mov_b32 s3, 0xf000 299; VI-NEXT: s_mov_b32 s2, -1 300; VI-NEXT: s_mov_b32 s0, s4 301; VI-NEXT: s_waitcnt lgkmcnt(0) 302; VI-NEXT: s_mul_i32 s4, s7, s6 303; VI-NEXT: s_mov_b32 s1, s5 304; VI-NEXT: v_mov_b32_e32 v0, s4 305; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 306; VI-NEXT: s_endpgm 307; 308; GFX9-LABEL: s_trunc_i64_mul_to_i32: 309; GFX9: ; %bb.0: ; %entry 
310; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 311; GFX9-NEXT: s_waitcnt lgkmcnt(0) 312; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 313; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 314; GFX9-NEXT: s_mov_b32 s3, 0xf000 315; GFX9-NEXT: s_mov_b32 s2, -1 316; GFX9-NEXT: s_mov_b32 s0, s4 317; GFX9-NEXT: s_waitcnt lgkmcnt(0) 318; GFX9-NEXT: s_mul_i32 s4, s7, s6 319; GFX9-NEXT: s_mov_b32 s1, s5 320; GFX9-NEXT: v_mov_b32_e32 v0, s4 321; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 322; GFX9-NEXT: s_endpgm 323; 324; GFX10-LABEL: s_trunc_i64_mul_to_i32: 325; GFX10: ; %bb.0: ; %entry 326; GFX10-NEXT: s_clause 0x1 327; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 328; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 329; GFX10-NEXT: s_waitcnt lgkmcnt(0) 330; GFX10-NEXT: s_mov_b32 s7, 0x31016000 331; GFX10-NEXT: s_mul_i32 s0, s2, s6 332; GFX10-NEXT: s_mov_b32 s6, -1 333; GFX10-NEXT: v_mov_b32_e32 v0, s0 334; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 335; GFX10-NEXT: s_endpgm 336; 337; GFX11-LABEL: s_trunc_i64_mul_to_i32: 338; GFX11: ; %bb.0: ; %entry 339; GFX11-NEXT: s_clause 0x1 340; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 341; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 342; GFX11-NEXT: s_waitcnt lgkmcnt(0) 343; GFX11-NEXT: s_mov_b32 s7, 0x31016000 344; GFX11-NEXT: s_mul_i32 s0, s0, s6 345; GFX11-NEXT: s_mov_b32 s6, -1 346; GFX11-NEXT: v_mov_b32_e32 v0, s0 347; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 348; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 349; GFX11-NEXT: s_endpgm 350; 351; EG-LABEL: s_trunc_i64_mul_to_i32: 352; EG: ; %bb.0: ; %entry 353; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 354; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 355; EG-NEXT: CF_END 356; EG-NEXT: PAD 357; EG-NEXT: ALU clause starting at 4: 358; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 359; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 360; EG-NEXT: MULLO_INT * T1.X, KC0[3].Y, KC0[2].W, 361entry: 362 %mul = mul i64 %b, %a 363 %trunc = trunc i64 %mul to i32 364 store i32 %trunc, 
ptr addrspace(1) %out, align 8 365 ret void 366} 367 368define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { 369; SI-LABEL: v_trunc_i64_mul_to_i32: 370; SI: ; %bb.0: ; %entry 371; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 372; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 373; SI-NEXT: s_mov_b32 s3, 0xf000 374; SI-NEXT: s_mov_b32 s2, -1 375; SI-NEXT: s_mov_b32 s14, s2 376; SI-NEXT: s_waitcnt lgkmcnt(0) 377; SI-NEXT: s_mov_b32 s12, s6 378; SI-NEXT: s_mov_b32 s13, s7 379; SI-NEXT: s_mov_b32 s15, s3 380; SI-NEXT: s_mov_b32 s10, s2 381; SI-NEXT: s_mov_b32 s11, s3 382; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 383; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 384; SI-NEXT: s_mov_b32 s0, s4 385; SI-NEXT: s_mov_b32 s1, s5 386; SI-NEXT: s_waitcnt vmcnt(0) 387; SI-NEXT: v_mul_lo_u32 v0, v1, v0 388; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 389; SI-NEXT: s_endpgm 390; 391; VI-LABEL: v_trunc_i64_mul_to_i32: 392; VI: ; %bb.0: ; %entry 393; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 394; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 395; VI-NEXT: s_mov_b32 s3, 0xf000 396; VI-NEXT: s_mov_b32 s2, -1 397; VI-NEXT: s_mov_b32 s14, s2 398; VI-NEXT: s_waitcnt lgkmcnt(0) 399; VI-NEXT: s_mov_b32 s12, s6 400; VI-NEXT: s_mov_b32 s13, s7 401; VI-NEXT: s_mov_b32 s15, s3 402; VI-NEXT: s_mov_b32 s10, s2 403; VI-NEXT: s_mov_b32 s11, s3 404; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 405; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 406; VI-NEXT: s_mov_b32 s0, s4 407; VI-NEXT: s_mov_b32 s1, s5 408; VI-NEXT: s_waitcnt vmcnt(0) 409; VI-NEXT: v_mul_lo_u32 v0, v1, v0 410; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 411; VI-NEXT: s_endpgm 412; 413; GFX9-LABEL: v_trunc_i64_mul_to_i32: 414; GFX9: ; %bb.0: ; %entry 415; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 416; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 417; GFX9-NEXT: s_mov_b32 s3, 0xf000 418; GFX9-NEXT: s_mov_b32 s2, -1 419; GFX9-NEXT: 
s_mov_b32 s14, s2 420; GFX9-NEXT: s_waitcnt lgkmcnt(0) 421; GFX9-NEXT: s_mov_b32 s12, s6 422; GFX9-NEXT: s_mov_b32 s13, s7 423; GFX9-NEXT: s_mov_b32 s15, s3 424; GFX9-NEXT: s_mov_b32 s10, s2 425; GFX9-NEXT: s_mov_b32 s11, s3 426; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 427; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 428; GFX9-NEXT: s_mov_b32 s0, s4 429; GFX9-NEXT: s_mov_b32 s1, s5 430; GFX9-NEXT: s_waitcnt vmcnt(0) 431; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 432; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 433; GFX9-NEXT: s_endpgm 434; 435; GFX10-LABEL: v_trunc_i64_mul_to_i32: 436; GFX10: ; %bb.0: ; %entry 437; GFX10-NEXT: s_clause 0x1 438; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 439; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 440; GFX10-NEXT: s_mov_b32 s2, -1 441; GFX10-NEXT: s_mov_b32 s3, 0x31016000 442; GFX10-NEXT: s_mov_b32 s14, s2 443; GFX10-NEXT: s_mov_b32 s15, s3 444; GFX10-NEXT: s_mov_b32 s10, s2 445; GFX10-NEXT: s_mov_b32 s11, s3 446; GFX10-NEXT: s_waitcnt lgkmcnt(0) 447; GFX10-NEXT: s_mov_b32 s12, s6 448; GFX10-NEXT: s_mov_b32 s13, s7 449; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 450; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 451; GFX10-NEXT: s_mov_b32 s0, s4 452; GFX10-NEXT: s_mov_b32 s1, s5 453; GFX10-NEXT: s_waitcnt vmcnt(0) 454; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 455; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 456; GFX10-NEXT: s_endpgm 457; 458; GFX11-LABEL: v_trunc_i64_mul_to_i32: 459; GFX11: ; %bb.0: ; %entry 460; GFX11-NEXT: s_clause 0x1 461; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 462; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 463; GFX11-NEXT: s_mov_b32 s10, -1 464; GFX11-NEXT: s_mov_b32 s11, 0x31016000 465; GFX11-NEXT: s_mov_b32 s14, s10 466; GFX11-NEXT: s_mov_b32 s15, s11 467; GFX11-NEXT: s_mov_b32 s2, s10 468; GFX11-NEXT: s_mov_b32 s3, s11 469; GFX11-NEXT: s_waitcnt lgkmcnt(0) 470; GFX11-NEXT: s_mov_b32 s12, s6 471; GFX11-NEXT: s_mov_b32 s13, s7 472; GFX11-NEXT: buffer_load_b32 v0, 
off, s[12:15], 0 473; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 474; GFX11-NEXT: s_mov_b32 s8, s4 475; GFX11-NEXT: s_mov_b32 s9, s5 476; GFX11-NEXT: s_waitcnt vmcnt(0) 477; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0 478; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 479; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 480; GFX11-NEXT: s_endpgm 481; 482; EG-LABEL: v_trunc_i64_mul_to_i32: 483; EG: ; %bb.0: ; %entry 484; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 485; EG-NEXT: TEX 1 @6 486; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] 487; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 488; EG-NEXT: CF_END 489; EG-NEXT: PAD 490; EG-NEXT: Fetch clause starting at 6: 491; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 492; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 493; EG-NEXT: ALU clause starting at 10: 494; EG-NEXT: MOV T0.X, KC0[2].Z, 495; EG-NEXT: MOV * T1.X, KC0[2].W, 496; EG-NEXT: ALU clause starting at 12: 497; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 498; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, 499; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 500entry: 501 %a = load i64, ptr addrspace(1) %aptr, align 8 502 %b = load i64, ptr addrspace(1) %bptr, align 8 503 %mul = mul i64 %b, %a 504 %trunc = trunc i64 %mul to i32 505 store i32 %trunc, ptr addrspace(1) %out, align 8 506 ret void 507} 508 509; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top 510; 32 bits of both arguments are sign bits. 
; mul64_sext_c sign-extends the i32 kernel argument %in to i64, multiplies it by the constant 80 and stores the 64-bit product to %out. The expected code pairs a signed high-half multiply with a low-half multiply, except the tonga run, which expects a single v_mad_i64_i32.
511 512define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { 513; SI-LABEL: mul64_sext_c: 514; SI: ; %bb.0: ; %entry 515; SI-NEXT: s_load_dword s4, s[0:1], 0xb 516; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 517; SI-NEXT: v_mov_b32_e32 v0, 0x50 518; SI-NEXT: s_mov_b32 s3, 0xf000 519; SI-NEXT: s_mov_b32 s2, -1 520; SI-NEXT: s_waitcnt lgkmcnt(0) 521; SI-NEXT: v_mul_hi_i32 v1, s4, v0 522; SI-NEXT: s_mulk_i32 s4, 0x50 523; SI-NEXT: v_mov_b32_e32 v0, s4 524; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 525; SI-NEXT: s_endpgm 526; 527; VI-LABEL: mul64_sext_c: 528; VI: ; %bb.0: ; %entry 529; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 530; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 531; VI-NEXT: v_mov_b32_e32 v0, 0x50 532; VI-NEXT: s_waitcnt lgkmcnt(0) 533; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 534; VI-NEXT: s_mov_b32 s3, 0xf000 535; VI-NEXT: s_mov_b32 s2, -1 536; VI-NEXT: s_nop 2 537; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 538; VI-NEXT: s_endpgm 539; 540; GFX9-LABEL: mul64_sext_c: 541; GFX9: ; %bb.0: ; %entry 542; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 543; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 544; GFX9-NEXT: s_mov_b32 s7, 0xf000 545; GFX9-NEXT: s_mov_b32 s6, -1 546; GFX9-NEXT: s_waitcnt lgkmcnt(0) 547; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 548; GFX9-NEXT: s_mulk_i32 s2, 0x50 549; GFX9-NEXT: v_mov_b32_e32 v0, s2 550; GFX9-NEXT: v_mov_b32_e32 v1, s0 551; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 552; GFX9-NEXT: s_endpgm 553; 554; GFX10-LABEL: mul64_sext_c: 555; GFX10: ; %bb.0: ; %entry 556; GFX10-NEXT: s_clause 0x1 557; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c 558; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 559; GFX10-NEXT: s_mov_b32 s7, 0x31016000 560; GFX10-NEXT: s_mov_b32 s6, -1 561; GFX10-NEXT: s_waitcnt lgkmcnt(0) 562; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 563; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 564; GFX10-NEXT: v_mov_b32_e32 v0, s0 565; GFX10-NEXT: v_mov_b32_e32 v1, s1 566; GFX10-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 567; GFX10-NEXT: s_endpgm 568; 569; GFX11-LABEL: mul64_sext_c: 570; GFX11: ; %bb.0: ; %entry 571; GFX11-NEXT: s_clause 0x1 572; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c 573; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 574; GFX11-NEXT: s_waitcnt lgkmcnt(0) 575; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 576; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 577; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 578; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 579; GFX11-NEXT: s_mov_b32 s3, 0x31016000 580; GFX11-NEXT: s_mov_b32 s2, -1 581; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 582; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 583; GFX11-NEXT: s_endpgm 584; 585; EG-LABEL: mul64_sext_c: 586; EG: ; %bb.0: ; %entry 587; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 588; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 589; EG-NEXT: CF_END 590; EG-NEXT: PAD 591; EG-NEXT: ALU clause starting at 4: 592; EG-NEXT: MULHI_INT * T0.Y, KC0[2].Z, literal.x, 593; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 594; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 595; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, 596; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 597entry: 598 %0 = sext i32 %in to i64 599 %1 = mul i64 %0, 80 600 store i64 %1, ptr addrspace(1) %out 601 ret void 602} 603 604define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { 605; SI-LABEL: v_mul64_sext_c: 606; SI: ; %bb.0: ; %entry 607; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 608; SI-NEXT: s_mov_b32 s7, 0xf000 609; SI-NEXT: s_mov_b32 s6, -1 610; SI-NEXT: s_mov_b32 s10, s6 611; SI-NEXT: s_mov_b32 s11, s7 612; SI-NEXT: s_waitcnt lgkmcnt(0) 613; SI-NEXT: s_mov_b32 s8, s2 614; SI-NEXT: s_mov_b32 s9, s3 615; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 616; SI-NEXT: s_movk_i32 s2, 0x50 617; SI-NEXT: s_mov_b32 s4, s0 618; SI-NEXT: s_mov_b32 s5, s1 619; SI-NEXT: s_waitcnt vmcnt(0) 620; SI-NEXT: v_mul_hi_i32 v1, v0, s2 621; SI-NEXT: v_mul_lo_u32 
v0, v0, s2 622; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 623; SI-NEXT: s_endpgm 624; 625; VI-LABEL: v_mul64_sext_c: 626; VI: ; %bb.0: ; %entry 627; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 628; VI-NEXT: s_mov_b32 s7, 0xf000 629; VI-NEXT: s_mov_b32 s6, -1 630; VI-NEXT: s_mov_b32 s10, s6 631; VI-NEXT: s_mov_b32 s11, s7 632; VI-NEXT: s_waitcnt lgkmcnt(0) 633; VI-NEXT: s_mov_b32 s8, s2 634; VI-NEXT: s_mov_b32 s9, s3 635; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 636; VI-NEXT: s_movk_i32 s2, 0x50 637; VI-NEXT: s_mov_b32 s4, s0 638; VI-NEXT: s_mov_b32 s5, s1 639; VI-NEXT: s_waitcnt vmcnt(0) 640; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0 641; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 642; VI-NEXT: s_endpgm 643; 644; GFX9-LABEL: v_mul64_sext_c: 645; GFX9: ; %bb.0: ; %entry 646; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 647; GFX9-NEXT: s_mov_b32 s7, 0xf000 648; GFX9-NEXT: s_mov_b32 s6, -1 649; GFX9-NEXT: s_mov_b32 s10, s6 650; GFX9-NEXT: s_mov_b32 s11, s7 651; GFX9-NEXT: s_waitcnt lgkmcnt(0) 652; GFX9-NEXT: s_mov_b32 s8, s2 653; GFX9-NEXT: s_mov_b32 s9, s3 654; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 655; GFX9-NEXT: s_movk_i32 s2, 0x50 656; GFX9-NEXT: s_mov_b32 s4, s0 657; GFX9-NEXT: s_mov_b32 s5, s1 658; GFX9-NEXT: s_waitcnt vmcnt(0) 659; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 660; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 661; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 662; GFX9-NEXT: s_endpgm 663; 664; GFX10-LABEL: v_mul64_sext_c: 665; GFX10: ; %bb.0: ; %entry 666; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 667; GFX10-NEXT: s_mov_b32 s6, -1 668; GFX10-NEXT: s_mov_b32 s7, 0x31016000 669; GFX10-NEXT: s_mov_b32 s10, s6 670; GFX10-NEXT: s_mov_b32 s11, s7 671; GFX10-NEXT: s_waitcnt lgkmcnt(0) 672; GFX10-NEXT: s_mov_b32 s8, s2 673; GFX10-NEXT: s_mov_b32 s9, s3 674; GFX10-NEXT: s_mov_b32 s4, s0 675; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 676; GFX10-NEXT: s_mov_b32 s5, s1 677; GFX10-NEXT: s_waitcnt vmcnt(0) 
678; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0 679; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 680; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 681; GFX10-NEXT: s_endpgm 682; 683; GFX11-LABEL: v_mul64_sext_c: 684; GFX11: ; %bb.0: ; %entry 685; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 686; GFX11-NEXT: s_mov_b32 s6, -1 687; GFX11-NEXT: s_mov_b32 s7, 0x31016000 688; GFX11-NEXT: s_mov_b32 s10, s6 689; GFX11-NEXT: s_mov_b32 s11, s7 690; GFX11-NEXT: s_waitcnt lgkmcnt(0) 691; GFX11-NEXT: s_mov_b32 s8, s2 692; GFX11-NEXT: s_mov_b32 s9, s3 693; GFX11-NEXT: s_mov_b32 s4, s0 694; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 695; GFX11-NEXT: s_mov_b32 s5, s1 696; GFX11-NEXT: s_waitcnt vmcnt(0) 697; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0 698; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 699; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 700; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 701; GFX11-NEXT: s_endpgm 702; 703; EG-LABEL: v_mul64_sext_c: 704; EG: ; %bb.0: ; %entry 705; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 706; EG-NEXT: TEX 0 @6 707; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 708; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 709; EG-NEXT: CF_END 710; EG-NEXT: PAD 711; EG-NEXT: Fetch clause starting at 6: 712; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 713; EG-NEXT: ALU clause starting at 8: 714; EG-NEXT: MOV * T0.X, KC0[2].Z, 715; EG-NEXT: ALU clause starting at 9: 716; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 717; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 718; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 719; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 720; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 721entry: 722 %val = load i32, ptr addrspace(1) %in, align 4 723 %ext = sext i32 %val to i64 724 %mul = mul i64 %ext, 80 725 store i64 %mul, ptr addrspace(1) %out, align 8 726 ret void 727} 728 729define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { 730; SI-LABEL: v_mul64_sext_inline_imm: 731; SI: ; %bb.0: ; 
%entry 732; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 733; SI-NEXT: s_mov_b32 s7, 0xf000 734; SI-NEXT: s_mov_b32 s6, -1 735; SI-NEXT: s_mov_b32 s10, s6 736; SI-NEXT: s_mov_b32 s11, s7 737; SI-NEXT: s_waitcnt lgkmcnt(0) 738; SI-NEXT: s_mov_b32 s8, s2 739; SI-NEXT: s_mov_b32 s9, s3 740; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 741; SI-NEXT: s_mov_b32 s4, s0 742; SI-NEXT: s_mov_b32 s5, s1 743; SI-NEXT: s_waitcnt vmcnt(0) 744; SI-NEXT: v_mul_hi_i32 v1, v0, 9 745; SI-NEXT: v_mul_lo_u32 v0, v0, 9 746; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 747; SI-NEXT: s_endpgm 748; 749; VI-LABEL: v_mul64_sext_inline_imm: 750; VI: ; %bb.0: ; %entry 751; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 752; VI-NEXT: s_mov_b32 s7, 0xf000 753; VI-NEXT: s_mov_b32 s6, -1 754; VI-NEXT: s_mov_b32 s10, s6 755; VI-NEXT: s_mov_b32 s11, s7 756; VI-NEXT: s_waitcnt lgkmcnt(0) 757; VI-NEXT: s_mov_b32 s8, s2 758; VI-NEXT: s_mov_b32 s9, s3 759; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 760; VI-NEXT: s_mov_b32 s4, s0 761; VI-NEXT: s_mov_b32 s5, s1 762; VI-NEXT: s_waitcnt vmcnt(0) 763; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0 764; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 765; VI-NEXT: s_endpgm 766; 767; GFX9-LABEL: v_mul64_sext_inline_imm: 768; GFX9: ; %bb.0: ; %entry 769; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 770; GFX9-NEXT: s_mov_b32 s7, 0xf000 771; GFX9-NEXT: s_mov_b32 s6, -1 772; GFX9-NEXT: s_mov_b32 s10, s6 773; GFX9-NEXT: s_mov_b32 s11, s7 774; GFX9-NEXT: s_waitcnt lgkmcnt(0) 775; GFX9-NEXT: s_mov_b32 s8, s2 776; GFX9-NEXT: s_mov_b32 s9, s3 777; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 778; GFX9-NEXT: s_mov_b32 s4, s0 779; GFX9-NEXT: s_mov_b32 s5, s1 780; GFX9-NEXT: s_waitcnt vmcnt(0) 781; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9 782; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9 783; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 784; GFX9-NEXT: s_endpgm 785; 786; GFX10-LABEL: v_mul64_sext_inline_imm: 787; GFX10: ; %bb.0: ; %entry 788; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 789; GFX10-NEXT: s_mov_b32 s6, -1 790; GFX10-NEXT: s_mov_b32 s7, 0x31016000 791; GFX10-NEXT: s_mov_b32 s10, s6 792; GFX10-NEXT: s_mov_b32 s11, s7 793; GFX10-NEXT: s_waitcnt lgkmcnt(0) 794; GFX10-NEXT: s_mov_b32 s8, s2 795; GFX10-NEXT: s_mov_b32 s9, s3 796; GFX10-NEXT: s_mov_b32 s4, s0 797; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 798; GFX10-NEXT: s_mov_b32 s5, s1 799; GFX10-NEXT: s_waitcnt vmcnt(0) 800; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9 801; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9 802; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 803; GFX10-NEXT: s_endpgm 804; 805; GFX11-LABEL: v_mul64_sext_inline_imm: 806; GFX11: ; %bb.0: ; %entry 807; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 808; GFX11-NEXT: s_mov_b32 s6, -1 809; GFX11-NEXT: s_mov_b32 s7, 0x31016000 810; GFX11-NEXT: s_mov_b32 s10, s6 811; GFX11-NEXT: s_mov_b32 s11, s7 812; GFX11-NEXT: s_waitcnt lgkmcnt(0) 813; GFX11-NEXT: s_mov_b32 s8, s2 814; GFX11-NEXT: s_mov_b32 s9, s3 815; GFX11-NEXT: s_mov_b32 s4, s0 816; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 817; GFX11-NEXT: s_mov_b32 s5, s1 818; GFX11-NEXT: s_waitcnt vmcnt(0) 819; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9 820; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9 821; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 822; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 823; GFX11-NEXT: s_endpgm 824; 825; EG-LABEL: v_mul64_sext_inline_imm: 826; EG: ; %bb.0: ; %entry 827; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 828; EG-NEXT: TEX 0 @6 829; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 830; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 831; EG-NEXT: CF_END 832; EG-NEXT: PAD 833; EG-NEXT: Fetch clause starting at 6: 834; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 835; EG-NEXT: ALU clause starting at 8: 836; EG-NEXT: MOV * T0.X, KC0[2].Z, 837; EG-NEXT: ALU clause starting at 9: 838; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 839; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 840; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 841; 
EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 842; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) 843entry: 844 %val = load i32, ptr addrspace(1) %in, align 4 845 %ext = sext i32 %val to i64 846 %mul = mul i64 %ext, 9 847 store i64 %mul, ptr addrspace(1) %out, align 8 848 ret void 849} 850; s_mul_i32 multiplies the i32 kernel arguments %a and %b and stores the product to %out. The unnamed [8 x i32] parameters sit between the operands; the amdgcn runs are expected to fetch %a and %b with separate s_load instructions and multiply them with one s_mul_i32, and the redwood run expects one MULLO_INT on two KC0 operands. 851define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { 852; SI-LABEL: s_mul_i32: 853; SI: ; %bb.0: ; %entry 854; SI-NEXT: s_load_dword s4, s[0:1], 0x13 855; SI-NEXT: s_load_dword s5, s[0:1], 0x1c 856; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 857; SI-NEXT: s_mov_b32 s3, 0xf000 858; SI-NEXT: s_mov_b32 s2, -1 859; SI-NEXT: s_waitcnt lgkmcnt(0) 860; SI-NEXT: s_mul_i32 s4, s4, s5 861; SI-NEXT: v_mov_b32_e32 v0, s4 862; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 863; SI-NEXT: s_endpgm 864; 865; VI-LABEL: s_mul_i32: 866; VI: ; %bb.0: ; %entry 867; VI-NEXT: s_load_dword s4, s[0:1], 0x4c 868; VI-NEXT: s_load_dword s5, s[0:1], 0x70 869; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 870; VI-NEXT: s_mov_b32 s3, 0xf000 871; VI-NEXT: s_mov_b32 s2, -1 872; VI-NEXT: s_waitcnt lgkmcnt(0) 873; VI-NEXT: s_mul_i32 s4, s4, s5 874; VI-NEXT: v_mov_b32_e32 v0, s4 875; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 876; VI-NEXT: s_endpgm 877; 878; GFX9-LABEL: s_mul_i32: 879; GFX9: ; %bb.0: ; %entry 880; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c 881; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 882; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 883; GFX9-NEXT: s_mov_b32 s7, 0xf000 884; GFX9-NEXT: s_mov_b32 s6, -1 885; GFX9-NEXT: s_waitcnt lgkmcnt(0) 886; GFX9-NEXT: s_mul_i32 s0, s2, s3 887; GFX9-NEXT: v_mov_b32_e32 v0, s0 888; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 889; GFX9-NEXT: s_endpgm 890; 891; GFX10-LABEL: s_mul_i32: 892; GFX10: ; %bb.0: ; %entry 893; GFX10-NEXT: s_clause 0x2 894; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c 895; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 896; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 897; GFX10-NEXT: 
s_mov_b32 s7, 0x31016000 898; GFX10-NEXT: s_mov_b32 s6, -1 899; GFX10-NEXT: s_waitcnt lgkmcnt(0) 900; GFX10-NEXT: s_mul_i32 s0, s2, s3 901; GFX10-NEXT: v_mov_b32_e32 v0, s0 902; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 903; GFX10-NEXT: s_endpgm 904; 905; GFX11-LABEL: s_mul_i32: 906; GFX11: ; %bb.0: ; %entry 907; GFX11-NEXT: s_clause 0x2 908; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c 909; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 910; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 911; GFX11-NEXT: s_waitcnt lgkmcnt(0) 912; GFX11-NEXT: s_mul_i32 s2, s2, s3 913; GFX11-NEXT: s_mov_b32 s3, 0x31016000 914; GFX11-NEXT: v_mov_b32_e32 v0, s2 915; GFX11-NEXT: s_mov_b32 s2, -1 916; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 917; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 918; GFX11-NEXT: s_endpgm 919; 920; EG-LABEL: s_mul_i32: 921; EG: ; %bb.0: ; %entry 922; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 923; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 924; EG-NEXT: CF_END 925; EG-NEXT: PAD 926; EG-NEXT: ALU clause starting at 4: 927; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 928; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 929; EG-NEXT: MULLO_INT * T1.X, KC0[4].Z, KC0[6].W, 930entry: 931 %mul = mul i32 %a, %b 932 store i32 %mul, ptr addrspace(1) %out, align 4 933 ret void 934} 935 936define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 937; SI-LABEL: v_mul_i32: 938; SI: ; %bb.0: ; %entry 939; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 940; SI-NEXT: s_mov_b32 s7, 0xf000 941; SI-NEXT: s_mov_b32 s6, -1 942; SI-NEXT: s_mov_b32 s10, s6 943; SI-NEXT: s_mov_b32 s11, s7 944; SI-NEXT: s_waitcnt lgkmcnt(0) 945; SI-NEXT: s_mov_b32 s8, s2 946; SI-NEXT: s_mov_b32 s9, s3 947; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 948; SI-NEXT: s_mov_b32 s4, s0 949; SI-NEXT: s_mov_b32 s5, s1 950; SI-NEXT: s_waitcnt vmcnt(0) 951; SI-NEXT: v_mul_lo_u32 v0, v0, v1 952; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 953; SI-NEXT: s_endpgm 954; 955; 
VI-LABEL: v_mul_i32: 956; VI: ; %bb.0: ; %entry 957; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 958; VI-NEXT: s_mov_b32 s7, 0xf000 959; VI-NEXT: s_mov_b32 s6, -1 960; VI-NEXT: s_mov_b32 s10, s6 961; VI-NEXT: s_mov_b32 s11, s7 962; VI-NEXT: s_waitcnt lgkmcnt(0) 963; VI-NEXT: s_mov_b32 s8, s2 964; VI-NEXT: s_mov_b32 s9, s3 965; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 966; VI-NEXT: s_mov_b32 s4, s0 967; VI-NEXT: s_mov_b32 s5, s1 968; VI-NEXT: s_waitcnt vmcnt(0) 969; VI-NEXT: v_mul_lo_u32 v0, v0, v1 970; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 971; VI-NEXT: s_endpgm 972; 973; GFX9-LABEL: v_mul_i32: 974; GFX9: ; %bb.0: ; %entry 975; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 976; GFX9-NEXT: s_mov_b32 s7, 0xf000 977; GFX9-NEXT: s_mov_b32 s6, -1 978; GFX9-NEXT: s_mov_b32 s10, s6 979; GFX9-NEXT: s_mov_b32 s11, s7 980; GFX9-NEXT: s_waitcnt lgkmcnt(0) 981; GFX9-NEXT: s_mov_b32 s8, s2 982; GFX9-NEXT: s_mov_b32 s9, s3 983; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 984; GFX9-NEXT: s_mov_b32 s4, s0 985; GFX9-NEXT: s_mov_b32 s5, s1 986; GFX9-NEXT: s_waitcnt vmcnt(0) 987; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 988; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 989; GFX9-NEXT: s_endpgm 990; 991; GFX10-LABEL: v_mul_i32: 992; GFX10: ; %bb.0: ; %entry 993; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 994; GFX10-NEXT: s_mov_b32 s6, -1 995; GFX10-NEXT: s_mov_b32 s7, 0x31016000 996; GFX10-NEXT: s_mov_b32 s10, s6 997; GFX10-NEXT: s_mov_b32 s11, s7 998; GFX10-NEXT: s_waitcnt lgkmcnt(0) 999; GFX10-NEXT: s_mov_b32 s8, s2 1000; GFX10-NEXT: s_mov_b32 s9, s3 1001; GFX10-NEXT: s_mov_b32 s4, s0 1002; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1003; GFX10-NEXT: s_mov_b32 s5, s1 1004; GFX10-NEXT: s_waitcnt vmcnt(0) 1005; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 1006; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 1007; GFX10-NEXT: s_endpgm 1008; 1009; GFX11-LABEL: v_mul_i32: 1010; GFX11: ; %bb.0: ; %entry 1011; GFX11-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x24 1012; GFX11-NEXT: s_mov_b32 s6, -1 1013; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1014; GFX11-NEXT: s_mov_b32 s10, s6 1015; GFX11-NEXT: s_mov_b32 s11, s7 1016; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1017; GFX11-NEXT: s_mov_b32 s8, s2 1018; GFX11-NEXT: s_mov_b32 s9, s3 1019; GFX11-NEXT: s_mov_b32 s4, s0 1020; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 1021; GFX11-NEXT: s_mov_b32 s5, s1 1022; GFX11-NEXT: s_waitcnt vmcnt(0) 1023; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 1024; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 1025; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1026; GFX11-NEXT: s_endpgm 1027; 1028; EG-LABEL: v_mul_i32: 1029; EG: ; %bb.0: ; %entry 1030; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1031; EG-NEXT: TEX 0 @6 1032; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1033; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1034; EG-NEXT: CF_END 1035; EG-NEXT: PAD 1036; EG-NEXT: Fetch clause starting at 6: 1037; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1038; EG-NEXT: ALU clause starting at 8: 1039; EG-NEXT: MOV * T0.X, KC0[2].Z, 1040; EG-NEXT: ALU clause starting at 9: 1041; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1042; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y, 1043; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1044entry: 1045 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1046 %a = load i32, ptr addrspace(1) %in 1047 %b = load i32, ptr addrspace(1) %b_ptr 1048 %result = mul i32 %a, %b 1049 store i32 %result, ptr addrspace(1) %out 1050 ret void 1051} 1052 1053; A standard 64-bit multiply. The expansion should be around 6 instructions. 1054; The checks below are autogenerated by utils/update_llc_test_checks.py, so 1055; a correct optimization may legitimately change the expected sequence; in 1056; that case regenerate the assertions rather than editing them by hand. 1057; The main purpose of this test is to make sure the compiler does not 1058; crash with a 'failed to select' error. 
1059 1060define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { 1061; SI-LABEL: s_mul_i64: 1062; SI: ; %bb.0: ; %entry 1063; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1064; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1065; SI-NEXT: s_mov_b32 s3, 0xf000 1066; SI-NEXT: s_mov_b32 s2, -1 1067; SI-NEXT: s_waitcnt lgkmcnt(0) 1068; SI-NEXT: s_mov_b32 s0, s4 1069; SI-NEXT: v_mov_b32_e32 v0, s8 1070; SI-NEXT: v_mul_hi_u32 v0, s6, v0 1071; SI-NEXT: s_mul_i32 s4, s6, s9 1072; SI-NEXT: s_mov_b32 s1, s5 1073; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 1074; SI-NEXT: s_mul_i32 s4, s7, s8 1075; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0 1076; SI-NEXT: s_mul_i32 s4, s6, s8 1077; SI-NEXT: v_mov_b32_e32 v0, s4 1078; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1079; SI-NEXT: s_endpgm 1080; 1081; VI-LABEL: s_mul_i64: 1082; VI: ; %bb.0: ; %entry 1083; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1084; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1085; VI-NEXT: s_mov_b32 s3, 0xf000 1086; VI-NEXT: s_mov_b32 s2, -1 1087; VI-NEXT: s_waitcnt lgkmcnt(0) 1088; VI-NEXT: s_mov_b32 s0, s4 1089; VI-NEXT: v_mov_b32_e32 v0, s8 1090; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0 1091; VI-NEXT: s_mul_i32 s4, s6, s9 1092; VI-NEXT: s_mov_b32 s1, s5 1093; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1094; VI-NEXT: s_mul_i32 s4, s7, s8 1095; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1096; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1097; VI-NEXT: s_endpgm 1098; 1099; GFX9-LABEL: s_mul_i64: 1100; GFX9: ; %bb.0: ; %entry 1101; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1102; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1103; GFX9-NEXT: s_mov_b32 s3, 0xf000 1104; GFX9-NEXT: s_mov_b32 s2, -1 1105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX9-NEXT: s_mov_b32 s0, s4 1107; GFX9-NEXT: s_mov_b32 s1, s5 1108; GFX9-NEXT: s_mul_i32 s4, s6, s9 1109; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8 1110; GFX9-NEXT: s_add_i32 s4, s5, s4 1111; GFX9-NEXT: s_mul_i32 s5, s7, s8 1112; 
GFX9-NEXT: s_add_i32 s4, s4, s5 1113; GFX9-NEXT: s_mul_i32 s5, s6, s8 1114; GFX9-NEXT: v_mov_b32_e32 v0, s5 1115; GFX9-NEXT: v_mov_b32_e32 v1, s4 1116; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1117; GFX9-NEXT: s_endpgm 1118; 1119; GFX10-LABEL: s_mul_i64: 1120; GFX10: ; %bb.0: ; %entry 1121; GFX10-NEXT: s_clause 0x1 1122; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1123; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1124; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX10-NEXT: s_mul_i32 s0, s6, s3 1126; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 1127; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1128; GFX10-NEXT: s_add_i32 s0, s1, s0 1129; GFX10-NEXT: s_mul_i32 s1, s7, s2 1130; GFX10-NEXT: s_mul_i32 s2, s6, s2 1131; GFX10-NEXT: s_add_i32 s0, s0, s1 1132; GFX10-NEXT: v_mov_b32_e32 v0, s2 1133; GFX10-NEXT: v_mov_b32_e32 v1, s0 1134; GFX10-NEXT: s_mov_b32 s2, -1 1135; GFX10-NEXT: s_mov_b32 s0, s4 1136; GFX10-NEXT: s_mov_b32 s1, s5 1137; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1138; GFX10-NEXT: s_endpgm 1139; 1140; GFX11-LABEL: s_mul_i64: 1141; GFX11: ; %bb.0: ; %entry 1142; GFX11-NEXT: s_clause 0x1 1143; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1144; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1145; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1146; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1147; GFX11-NEXT: s_mul_i32 s1, s6, s1 1148; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0 1149; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 1150; GFX11-NEXT: s_add_i32 s1, s2, s1 1151; GFX11-NEXT: s_mul_i32 s2, s7, s0 1152; GFX11-NEXT: s_mul_i32 s0, s6, s0 1153; GFX11-NEXT: s_add_i32 s1, s1, s2 1154; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1155; GFX11-NEXT: s_mov_b32 s2, -1 1156; GFX11-NEXT: s_mov_b32 s0, s4 1157; GFX11-NEXT: s_mov_b32 s1, s5 1158; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1159; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1160; GFX11-NEXT: s_endpgm 1161; 1162; EG-LABEL: s_mul_i64: 1163; EG: ; 
%bb.0: ; %entry 1164; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1165; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1166; EG-NEXT: CF_END 1167; EG-NEXT: PAD 1168; EG-NEXT: ALU clause starting at 4: 1169; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y, 1170; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z, 1171; EG-NEXT: ADD_INT T0.W, T0.X, PS, 1172; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y, 1173; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 1174; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1175; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1176; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y, 1177entry: 1178 %mul = mul i64 %a, %b 1179 store i64 %mul, ptr addrspace(1) %out, align 8 1180 ret void 1181} 1182 1183define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { 1184; SI-LABEL: v_mul_i64: 1185; SI: ; %bb.0: ; %entry 1186; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1187; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1188; SI-NEXT: s_mov_b32 s3, 0xf000 1189; SI-NEXT: s_mov_b32 s2, -1 1190; SI-NEXT: s_mov_b32 s10, s2 1191; SI-NEXT: s_mov_b32 s11, s3 1192; SI-NEXT: s_waitcnt lgkmcnt(0) 1193; SI-NEXT: s_mov_b32 s12, s6 1194; SI-NEXT: s_mov_b32 s13, s7 1195; SI-NEXT: s_mov_b32 s14, s2 1196; SI-NEXT: s_mov_b32 s15, s3 1197; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1198; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1199; SI-NEXT: s_mov_b32 s0, s4 1200; SI-NEXT: s_mov_b32 s1, s5 1201; SI-NEXT: s_waitcnt vmcnt(0) 1202; SI-NEXT: v_mul_lo_u32 v1, v2, v1 1203; SI-NEXT: v_mul_hi_u32 v4, v2, v0 1204; SI-NEXT: v_mul_lo_u32 v3, v3, v0 1205; SI-NEXT: v_mul_lo_u32 v0, v2, v0 1206; SI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1207; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1208; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1209; SI-NEXT: s_endpgm 1210; 1211; VI-LABEL: v_mul_i64: 1212; VI: ; %bb.0: ; %entry 1213; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1214; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1215; VI-NEXT: 
s_mov_b32 s3, 0xf000 1216; VI-NEXT: s_mov_b32 s2, -1 1217; VI-NEXT: s_mov_b32 s10, s2 1218; VI-NEXT: s_mov_b32 s11, s3 1219; VI-NEXT: s_waitcnt lgkmcnt(0) 1220; VI-NEXT: s_mov_b32 s12, s6 1221; VI-NEXT: s_mov_b32 s13, s7 1222; VI-NEXT: s_mov_b32 s14, s2 1223; VI-NEXT: s_mov_b32 s15, s3 1224; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1225; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1226; VI-NEXT: s_mov_b32 s0, s4 1227; VI-NEXT: s_mov_b32 s1, s5 1228; VI-NEXT: s_waitcnt vmcnt(0) 1229; VI-NEXT: v_mul_lo_u32 v4, v2, v1 1230; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 1231; VI-NEXT: v_mul_lo_u32 v0, v3, v0 1232; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 1233; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 1234; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 1235; VI-NEXT: s_endpgm 1236; 1237; GFX9-LABEL: v_mul_i64: 1238; GFX9: ; %bb.0: ; %entry 1239; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1240; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1241; GFX9-NEXT: s_mov_b32 s3, 0xf000 1242; GFX9-NEXT: s_mov_b32 s2, -1 1243; GFX9-NEXT: s_mov_b32 s10, s2 1244; GFX9-NEXT: s_mov_b32 s11, s3 1245; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1246; GFX9-NEXT: s_mov_b32 s12, s6 1247; GFX9-NEXT: s_mov_b32 s13, s7 1248; GFX9-NEXT: s_mov_b32 s14, s2 1249; GFX9-NEXT: s_mov_b32 s15, s3 1250; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1251; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1252; GFX9-NEXT: s_mov_b32 s0, s4 1253; GFX9-NEXT: s_mov_b32 s1, s5 1254; GFX9-NEXT: s_waitcnt vmcnt(0) 1255; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 1256; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 1257; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 1258; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 1259; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 1260; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1261; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1262; GFX9-NEXT: s_endpgm 1263; 1264; GFX10-LABEL: v_mul_i64: 1265; GFX10: ; %bb.0: ; %entry 1266; GFX10-NEXT: s_clause 0x1 1267; GFX10-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 1268; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1269; GFX10-NEXT: s_mov_b32 s2, -1 1270; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1271; GFX10-NEXT: s_mov_b32 s10, s2 1272; GFX10-NEXT: s_mov_b32 s11, s3 1273; GFX10-NEXT: s_mov_b32 s14, s2 1274; GFX10-NEXT: s_mov_b32 s15, s3 1275; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1276; GFX10-NEXT: s_mov_b32 s12, s6 1277; GFX10-NEXT: s_mov_b32 s13, s7 1278; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1279; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1280; GFX10-NEXT: s_mov_b32 s0, s4 1281; GFX10-NEXT: s_mov_b32 s1, s5 1282; GFX10-NEXT: s_waitcnt vmcnt(0) 1283; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 1284; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 1285; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 1286; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 1287; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 1288; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 1289; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1290; GFX10-NEXT: s_endpgm 1291; 1292; GFX11-LABEL: v_mul_i64: 1293; GFX11: ; %bb.0: ; %entry 1294; GFX11-NEXT: s_clause 0x1 1295; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1296; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1297; GFX11-NEXT: s_mov_b32 s10, -1 1298; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1299; GFX11-NEXT: s_mov_b32 s2, s10 1300; GFX11-NEXT: s_mov_b32 s3, s11 1301; GFX11-NEXT: s_mov_b32 s14, s10 1302; GFX11-NEXT: s_mov_b32 s15, s11 1303; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX11-NEXT: s_mov_b32 s12, s6 1305; GFX11-NEXT: s_mov_b32 s13, s7 1306; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 1307; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 1308; GFX11-NEXT: s_mov_b32 s8, s4 1309; GFX11-NEXT: s_mov_b32 s9, s5 1310; GFX11-NEXT: s_waitcnt vmcnt(0) 1311; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 1312; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 1313; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0 1314; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0 1315; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) 1316; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 1317; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 1318; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 1319; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1320; GFX11-NEXT: s_endpgm 1321; 1322; EG-LABEL: v_mul_i64: 1323; EG: ; %bb.0: ; %entry 1324; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 1325; EG-NEXT: TEX 1 @6 1326; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[] 1327; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1 1328; EG-NEXT: CF_END 1329; EG-NEXT: PAD 1330; EG-NEXT: Fetch clause starting at 6: 1331; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 1332; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1333; EG-NEXT: ALU clause starting at 10: 1334; EG-NEXT: MOV T0.X, KC0[2].Z, 1335; EG-NEXT: MOV * T1.X, KC0[2].W, 1336; EG-NEXT: ALU clause starting at 12: 1337; EG-NEXT: MULHI * T0.Z, T0.X, T1.X, 1338; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y, 1339; EG-NEXT: ADD_INT T0.W, T0.Z, PS, 1340; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X, 1341; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 1342; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 1343; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 1344; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1345entry: 1346 %a = load i64, ptr addrspace(1) %aptr, align 8 1347 %b = load i64, ptr addrspace(1) %bptr, align 8 1348 %mul = mul i64 %a, %b 1349 store i64 %mul, ptr addrspace(1) %out, align 8 1350 ret void 1351} 1352 1353define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { 1354; SI-LABEL: mul32_in_branch: 1355; SI: ; %bb.0: ; %entry 1356; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 1357; SI-NEXT: s_waitcnt lgkmcnt(0) 1358; SI-NEXT: s_cmp_lg_u32 s2, 0 1359; SI-NEXT: s_cbranch_scc0 .LBB11_2 1360; SI-NEXT: ; %bb.1: ; %else 1361; SI-NEXT: s_mul_i32 s6, s2, s3 1362; SI-NEXT: s_mov_b64 s[4:5], 0 1363; SI-NEXT: s_branch .LBB11_3 1364; SI-NEXT: .LBB11_2: 1365; SI-NEXT: s_mov_b64 s[4:5], -1 1366; SI-NEXT: ; implicit-def: $sgpr6 1367; SI-NEXT: .LBB11_3: ; %Flow 
1368; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1369; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1370; SI-NEXT: s_waitcnt lgkmcnt(0) 1371; SI-NEXT: s_mov_b64 vcc, vcc 1372; SI-NEXT: s_cbranch_vccnz .LBB11_5 1373; SI-NEXT: ; %bb.4: ; %if 1374; SI-NEXT: s_mov_b32 s7, 0xf000 1375; SI-NEXT: s_mov_b32 s6, -1 1376; SI-NEXT: s_mov_b32 s4, s2 1377; SI-NEXT: s_mov_b32 s5, s3 1378; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1379; SI-NEXT: s_branch .LBB11_6 1380; SI-NEXT: .LBB11_5: 1381; SI-NEXT: v_mov_b32_e32 v0, s6 1382; SI-NEXT: .LBB11_6: ; %endif 1383; SI-NEXT: s_mov_b32 s3, 0xf000 1384; SI-NEXT: s_mov_b32 s2, -1 1385; SI-NEXT: s_waitcnt vmcnt(0) 1386; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1387; SI-NEXT: s_endpgm 1388; 1389; VI-LABEL: mul32_in_branch: 1390; VI: ; %bb.0: ; %entry 1391; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1392; VI-NEXT: s_waitcnt lgkmcnt(0) 1393; VI-NEXT: s_cmp_lg_u32 s2, 0 1394; VI-NEXT: s_cbranch_scc0 .LBB11_2 1395; VI-NEXT: ; %bb.1: ; %else 1396; VI-NEXT: s_mul_i32 s6, s2, s3 1397; VI-NEXT: s_mov_b64 s[4:5], 0 1398; VI-NEXT: s_branch .LBB11_3 1399; VI-NEXT: .LBB11_2: 1400; VI-NEXT: s_mov_b64 s[4:5], -1 1401; VI-NEXT: ; implicit-def: $sgpr6 1402; VI-NEXT: .LBB11_3: ; %Flow 1403; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1404; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1405; VI-NEXT: s_cbranch_vccnz .LBB11_5 1406; VI-NEXT: ; %bb.4: ; %if 1407; VI-NEXT: s_mov_b32 s7, 0xf000 1408; VI-NEXT: s_mov_b32 s6, -1 1409; VI-NEXT: s_waitcnt lgkmcnt(0) 1410; VI-NEXT: s_mov_b32 s4, s2 1411; VI-NEXT: s_mov_b32 s5, s3 1412; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1413; VI-NEXT: s_branch .LBB11_6 1414; VI-NEXT: .LBB11_5: 1415; VI-NEXT: v_mov_b32_e32 v0, s6 1416; VI-NEXT: .LBB11_6: ; %endif 1417; VI-NEXT: s_waitcnt lgkmcnt(0) 1418; VI-NEXT: s_mov_b32 s3, 0xf000 1419; VI-NEXT: s_mov_b32 s2, -1 1420; VI-NEXT: s_waitcnt vmcnt(0) 1421; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1422; VI-NEXT: s_endpgm 1423; 1424; GFX9-LABEL: mul32_in_branch: 1425; 
GFX9: ; %bb.0: ; %entry 1426; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1427; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1428; GFX9-NEXT: s_cmp_lg_u32 s2, 0 1429; GFX9-NEXT: s_cbranch_scc0 .LBB11_2 1430; GFX9-NEXT: ; %bb.1: ; %else 1431; GFX9-NEXT: s_mul_i32 s6, s2, s3 1432; GFX9-NEXT: s_mov_b64 s[4:5], 0 1433; GFX9-NEXT: s_branch .LBB11_3 1434; GFX9-NEXT: .LBB11_2: 1435; GFX9-NEXT: s_mov_b64 s[4:5], -1 1436; GFX9-NEXT: ; implicit-def: $sgpr6 1437; GFX9-NEXT: .LBB11_3: ; %Flow 1438; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1439; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1440; GFX9-NEXT: s_cbranch_vccnz .LBB11_5 1441; GFX9-NEXT: ; %bb.4: ; %if 1442; GFX9-NEXT: s_mov_b32 s7, 0xf000 1443; GFX9-NEXT: s_mov_b32 s6, -1 1444; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX9-NEXT: s_mov_b32 s4, s2 1446; GFX9-NEXT: s_mov_b32 s5, s3 1447; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 1448; GFX9-NEXT: s_branch .LBB11_6 1449; GFX9-NEXT: .LBB11_5: 1450; GFX9-NEXT: v_mov_b32_e32 v0, s6 1451; GFX9-NEXT: .LBB11_6: ; %endif 1452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1453; GFX9-NEXT: s_mov_b32 s3, 0xf000 1454; GFX9-NEXT: s_mov_b32 s2, -1 1455; GFX9-NEXT: s_waitcnt vmcnt(0) 1456; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1457; GFX9-NEXT: s_endpgm 1458; 1459; GFX10-LABEL: mul32_in_branch: 1460; GFX10: ; %bb.0: ; %entry 1461; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1462; GFX10-NEXT: s_mov_b32 s4, 0 1463; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1464; GFX10-NEXT: s_cmp_lg_u32 s2, 0 1465; GFX10-NEXT: s_cbranch_scc0 .LBB11_2 1466; GFX10-NEXT: ; %bb.1: ; %else 1467; GFX10-NEXT: s_mul_i32 s5, s2, s3 1468; GFX10-NEXT: s_branch .LBB11_3 1469; GFX10-NEXT: .LBB11_2: 1470; GFX10-NEXT: s_mov_b32 s4, -1 1471; GFX10-NEXT: ; implicit-def: $sgpr5 1472; GFX10-NEXT: .LBB11_3: ; %Flow 1473; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1474; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 1475; GFX10-NEXT: s_cbranch_vccnz .LBB11_5 1476; GFX10-NEXT: ; %bb.4: ; %if 1477; GFX10-NEXT: s_mov_b32 s7, 0x31016000 
1478; GFX10-NEXT: s_mov_b32 s6, -1 1479; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1480; GFX10-NEXT: s_mov_b32 s4, s2 1481; GFX10-NEXT: s_mov_b32 s5, s3 1482; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 1483; GFX10-NEXT: s_branch .LBB11_6 1484; GFX10-NEXT: .LBB11_5: 1485; GFX10-NEXT: v_mov_b32_e32 v0, s5 1486; GFX10-NEXT: .LBB11_6: ; %endif 1487; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1488; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1489; GFX10-NEXT: s_mov_b32 s2, -1 1490; GFX10-NEXT: s_waitcnt vmcnt(0) 1491; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 1492; GFX10-NEXT: s_endpgm 1493; 1494; GFX11-LABEL: mul32_in_branch: 1495; GFX11: ; %bb.0: ; %entry 1496; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 1497; GFX11-NEXT: s_mov_b32 s4, 0 1498; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX11-NEXT: s_cmp_lg_u32 s2, 0 1500; GFX11-NEXT: s_cbranch_scc0 .LBB11_2 1501; GFX11-NEXT: ; %bb.1: ; %else 1502; GFX11-NEXT: s_mul_i32 s5, s2, s3 1503; GFX11-NEXT: s_branch .LBB11_3 1504; GFX11-NEXT: .LBB11_2: 1505; GFX11-NEXT: s_mov_b32 s4, -1 1506; GFX11-NEXT: ; implicit-def: $sgpr5 1507; GFX11-NEXT: .LBB11_3: ; %Flow 1508; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1509; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 1510; GFX11-NEXT: s_cbranch_vccnz .LBB11_5 1511; GFX11-NEXT: ; %bb.4: ; %if 1512; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1513; GFX11-NEXT: s_mov_b32 s6, -1 1514; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1515; GFX11-NEXT: s_mov_b32 s4, s2 1516; GFX11-NEXT: s_mov_b32 s5, s3 1517; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 1518; GFX11-NEXT: s_branch .LBB11_6 1519; GFX11-NEXT: .LBB11_5: 1520; GFX11-NEXT: v_mov_b32_e32 v0, s5 1521; GFX11-NEXT: .LBB11_6: ; %endif 1522; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1523; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1524; GFX11-NEXT: s_mov_b32 s2, -1 1525; GFX11-NEXT: s_waitcnt vmcnt(0) 1526; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1527; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1528; GFX11-NEXT: s_endpgm 1529; 1530; EG-LABEL: mul32_in_branch: 1531; EG: 
; %bb.0: ; %entry 1532; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] 1533; EG-NEXT: JUMP @3 POP:1 1534; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[] 1535; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[] 1536; EG-NEXT: JUMP @8 POP:1 1537; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] 1538; EG-NEXT: TEX 0 @12 1539; EG-NEXT: POP @8 POP:1 1540; EG-NEXT: ALU 1, @27, KC0[], KC1[] 1541; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1542; EG-NEXT: CF_END 1543; EG-NEXT: PAD 1544; EG-NEXT: Fetch clause starting at 12: 1545; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1546; EG-NEXT: ALU clause starting at 14: 1547; EG-NEXT: MOV T1.W, literal.x, 1548; EG-NEXT: SETNE_INT * T0.W, KC0[2].W, 0.0, 1549; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1550; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1551; EG-NEXT: ALU clause starting at 18: 1552; EG-NEXT: MOV T0.W, KC0[2].W, 1553; EG-NEXT: MOV * T2.W, KC0[3].X, 1554; EG-NEXT: MOV T1.W, literal.x, 1555; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 1556; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1557; EG-NEXT: ALU clause starting at 23: 1558; EG-NEXT: MOV T0.W, KC0[2].Y, 1559; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 1560; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1561; EG-NEXT: ALU clause starting at 26: 1562; EG-NEXT: MOV * T0.X, KC0[2].Z, 1563; EG-NEXT: ALU clause starting at 27: 1564; EG-NEXT: LSHR * T1.X, T0.W, literal.x, 1565; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1566entry: 1567 %0 = icmp eq i32 %a, 0 1568 br i1 %0, label %if, label %else 1569 1570if: 1571 %1 = load i32, ptr addrspace(1) %in 1572 br label %endif 1573 1574else: 1575 %2 = mul i32 %a, %b 1576 br label %endif 1577 1578endif: 1579 %3 = phi i32 [%1, %if], [%2, %else] 1580 store i32 %3, ptr addrspace(1) %out 1581 ret void 1582} 1583 1584define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { 1585; SI-LABEL: mul64_in_branch: 1586; SI: ; %bb.0: ; %entry 1587; 
SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 1588; SI-NEXT: s_mov_b64 s[8:9], 0 1589; SI-NEXT: s_waitcnt lgkmcnt(0) 1590; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 1591; SI-NEXT: s_and_b64 vcc, exec, s[10:11] 1592; SI-NEXT: s_cbranch_vccz .LBB12_4 1593; SI-NEXT: ; %bb.1: ; %else 1594; SI-NEXT: v_mov_b32_e32 v0, s6 1595; SI-NEXT: v_mul_hi_u32 v0, s4, v0 1596; SI-NEXT: s_mul_i32 s7, s4, s7 1597; SI-NEXT: s_mul_i32 s5, s5, s6 1598; SI-NEXT: s_mul_i32 s4, s4, s6 1599; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 1600; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 1601; SI-NEXT: v_mov_b32_e32 v0, s4 1602; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 1603; SI-NEXT: s_cbranch_vccnz .LBB12_3 1604; SI-NEXT: .LBB12_2: ; %if 1605; SI-NEXT: s_mov_b32 s7, 0xf000 1606; SI-NEXT: s_mov_b32 s6, -1 1607; SI-NEXT: s_mov_b32 s4, s2 1608; SI-NEXT: s_mov_b32 s5, s3 1609; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1610; SI-NEXT: .LBB12_3: ; %endif 1611; SI-NEXT: s_mov_b32 s3, 0xf000 1612; SI-NEXT: s_mov_b32 s2, -1 1613; SI-NEXT: s_waitcnt vmcnt(0) 1614; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1615; SI-NEXT: s_endpgm 1616; SI-NEXT: .LBB12_4: 1617; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 1618; SI-NEXT: s_branch .LBB12_2 1619; 1620; VI-LABEL: mul64_in_branch: 1621; VI: ; %bb.0: ; %entry 1622; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 1623; VI-NEXT: s_mov_b64 s[8:9], 0 1624; VI-NEXT: s_waitcnt lgkmcnt(0) 1625; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 1626; VI-NEXT: s_cbranch_scc0 .LBB12_4 1627; VI-NEXT: ; %bb.1: ; %else 1628; VI-NEXT: v_mov_b32_e32 v0, s6 1629; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 1630; VI-NEXT: s_mul_i32 s4, s4, s7 1631; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1632; VI-NEXT: s_mul_i32 s4, s5, s6 1633; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1634; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 1635; VI-NEXT: s_cbranch_vccnz .LBB12_3 1636; VI-NEXT: .LBB12_2: ; %if 1637; VI-NEXT: s_mov_b32 s7, 0xf000 1638; VI-NEXT: s_mov_b32 s6, -1 1639; VI-NEXT: s_mov_b32 s4, s2 1640; 
VI-NEXT: s_mov_b32 s5, s3 1641; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1642; VI-NEXT: .LBB12_3: ; %endif 1643; VI-NEXT: s_mov_b32 s3, 0xf000 1644; VI-NEXT: s_mov_b32 s2, -1 1645; VI-NEXT: s_waitcnt vmcnt(0) 1646; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1647; VI-NEXT: s_endpgm 1648; VI-NEXT: .LBB12_4: 1649; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 1650; VI-NEXT: s_branch .LBB12_2 1651; 1652; GFX9-LABEL: mul64_in_branch: 1653; GFX9: ; %bb.0: ; %entry 1654; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 1655; GFX9-NEXT: s_mov_b64 s[8:9], 0 1656; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1657; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 1658; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 1659; GFX9-NEXT: ; %bb.1: ; %else 1660; GFX9-NEXT: s_mul_i32 s7, s4, s7 1661; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 1662; GFX9-NEXT: s_add_i32 s7, s10, s7 1663; GFX9-NEXT: s_mul_i32 s5, s5, s6 1664; GFX9-NEXT: s_add_i32 s5, s7, s5 1665; GFX9-NEXT: s_mul_i32 s4, s4, s6 1666; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] 1667; GFX9-NEXT: s_cbranch_vccnz .LBB12_4 1668; GFX9-NEXT: .LBB12_2: ; %if 1669; GFX9-NEXT: s_mov_b32 s7, 0xf000 1670; GFX9-NEXT: s_mov_b32 s6, -1 1671; GFX9-NEXT: s_mov_b32 s4, s2 1672; GFX9-NEXT: s_mov_b32 s5, s3 1673; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1674; GFX9-NEXT: s_branch .LBB12_5 1675; GFX9-NEXT: .LBB12_3: 1676; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 1677; GFX9-NEXT: s_branch .LBB12_2 1678; GFX9-NEXT: .LBB12_4: 1679; GFX9-NEXT: v_mov_b32_e32 v0, s4 1680; GFX9-NEXT: v_mov_b32_e32 v1, s5 1681; GFX9-NEXT: .LBB12_5: ; %endif 1682; GFX9-NEXT: s_mov_b32 s3, 0xf000 1683; GFX9-NEXT: s_mov_b32 s2, -1 1684; GFX9-NEXT: s_waitcnt vmcnt(0) 1685; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1686; GFX9-NEXT: s_endpgm 1687; 1688; GFX10-LABEL: mul64_in_branch: 1689; GFX10: ; %bb.0: ; %entry 1690; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 1691; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1692; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 1693; GFX10-NEXT: s_cbranch_scc0 .LBB12_3 
1694; GFX10-NEXT: ; %bb.1: ; %else 1695; GFX10-NEXT: s_mul_i32 s7, s4, s7 1696; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 1697; GFX10-NEXT: s_mul_i32 s5, s5, s6 1698; GFX10-NEXT: s_add_i32 s7, s8, s7 1699; GFX10-NEXT: s_mul_i32 s4, s4, s6 1700; GFX10-NEXT: s_add_i32 s5, s7, s5 1701; GFX10-NEXT: s_mov_b32 s6, 0 1702; GFX10-NEXT: s_cbranch_execnz .LBB12_4 1703; GFX10-NEXT: .LBB12_2: ; %if 1704; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1705; GFX10-NEXT: s_mov_b32 s6, -1 1706; GFX10-NEXT: s_mov_b32 s4, s2 1707; GFX10-NEXT: s_mov_b32 s5, s3 1708; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1709; GFX10-NEXT: s_branch .LBB12_5 1710; GFX10-NEXT: .LBB12_3: 1711; GFX10-NEXT: s_mov_b32 s6, -1 1712; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 1713; GFX10-NEXT: s_branch .LBB12_2 1714; GFX10-NEXT: .LBB12_4: 1715; GFX10-NEXT: v_mov_b32_e32 v0, s4 1716; GFX10-NEXT: v_mov_b32_e32 v1, s5 1717; GFX10-NEXT: .LBB12_5: ; %endif 1718; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1719; GFX10-NEXT: s_mov_b32 s2, -1 1720; GFX10-NEXT: s_waitcnt vmcnt(0) 1721; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1722; GFX10-NEXT: s_endpgm 1723; 1724; GFX11-LABEL: mul64_in_branch: 1725; GFX11: ; %bb.0: ; %entry 1726; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 1727; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1728; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 1729; GFX11-NEXT: s_cbranch_scc0 .LBB12_3 1730; GFX11-NEXT: ; %bb.1: ; %else 1731; GFX11-NEXT: s_mul_i32 s7, s4, s7 1732; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 1733; GFX11-NEXT: s_mul_i32 s5, s5, s6 1734; GFX11-NEXT: s_add_i32 s7, s8, s7 1735; GFX11-NEXT: s_mul_i32 s4, s4, s6 1736; GFX11-NEXT: s_add_i32 s5, s7, s5 1737; GFX11-NEXT: s_mov_b32 s6, 0 1738; GFX11-NEXT: s_cbranch_execnz .LBB12_4 1739; GFX11-NEXT: .LBB12_2: ; %if 1740; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1741; GFX11-NEXT: s_mov_b32 s6, -1 1742; GFX11-NEXT: s_mov_b32 s4, s2 1743; GFX11-NEXT: s_mov_b32 s5, s3 1744; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 1745; GFX11-NEXT: s_branch .LBB12_5 1746; 
GFX11-NEXT: .LBB12_3: 1747; GFX11-NEXT: s_mov_b32 s6, -1 1748; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 1749; GFX11-NEXT: s_branch .LBB12_2 1750; GFX11-NEXT: .LBB12_4: 1751; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 1752; GFX11-NEXT: .LBB12_5: ; %endif 1753; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1754; GFX11-NEXT: s_mov_b32 s2, -1 1755; GFX11-NEXT: s_waitcnt vmcnt(0) 1756; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1757; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1758; GFX11-NEXT: s_endpgm 1759; 1760; EG-LABEL: mul64_in_branch: 1761; EG: ; %bb.0: ; %entry 1762; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] 1763; EG-NEXT: JUMP @3 POP:1 1764; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[] 1765; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] 1766; EG-NEXT: JUMP @8 POP:1 1767; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[] 1768; EG-NEXT: TEX 0 @12 1769; EG-NEXT: POP @8 POP:1 1770; EG-NEXT: ALU 1, @35, KC0[], KC1[] 1771; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1772; EG-NEXT: CF_END 1773; EG-NEXT: PAD 1774; EG-NEXT: Fetch clause starting at 12: 1775; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1776; EG-NEXT: ALU clause starting at 14: 1777; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, 1778; EG-NEXT: MOV * T1.W, literal.x, 1779; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1780; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, 1781; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, 1782; EG-NEXT: ALU clause starting at 19: 1783; EG-NEXT: MOV T0.W, KC0[2].W, 1784; EG-NEXT: MOV * T1.W, KC0[3].Z, 1785; EG-NEXT: MOV T2.W, KC0[3].Y, 1786; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 1787; EG-NEXT: MOV T1.W, KC0[3].X, 1788; EG-NEXT: MULHI * T0.Y, T0.W, PV.W, 1789; EG-NEXT: ADD_INT T3.W, PS, T0.X, 1790; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W, 1791; EG-NEXT: ADD_INT T0.Y, PV.W, PS, 1792; EG-NEXT: MOV T1.W, literal.x, 1793; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W, 1794; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1795; EG-NEXT: ALU clause 
starting at 31: 1796; EG-NEXT: MOV T0.W, KC0[2].Y, 1797; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 1798; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1799; EG-NEXT: ALU clause starting at 34: 1800; EG-NEXT: MOV * T0.X, KC0[2].Z, 1801; EG-NEXT: ALU clause starting at 35: 1802; EG-NEXT: LSHR * T1.X, T0.W, literal.x, 1803; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1804entry: 1805 %0 = icmp eq i64 %a, 0 1806 br i1 %0, label %if, label %else 1807 1808if: 1809 %1 = load i64, ptr addrspace(1) %in 1810 br label %endif 1811 1812else: 1813 %2 = mul i64 %a, %b 1814 br label %endif 1815 1816endif: 1817 %3 = phi i64 [%1, %if], [%2, %else] 1818 store i64 %3, ptr addrspace(1) %out 1819 ret void 1820} 1821; s_mul_i128 multiplies its two i128 kernel arguments %a and %b and stores the product through %out. Each operand is preceded by an unnamed [8 x i32] argument, presumably padding that keeps the operands at separate kernarg offsets (TODO confirm). NOTE(review) - the define below spells out nounwind and also references #0, which already carries nounwind. 1822define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { 1823; SI-LABEL: s_mul_i128: 1824; SI: ; %bb.0: ; %entry 1825; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 1826; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f 1827; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1828; SI-NEXT: s_mov_b32 s3, 0xf000 1829; SI-NEXT: s_mov_b32 s2, -1 1830; SI-NEXT: s_waitcnt lgkmcnt(0) 1831; SI-NEXT: v_mov_b32_e32 v0, s6 1832; SI-NEXT: v_mul_hi_u32 v0, s8, v0 1833; SI-NEXT: v_mov_b32_e32 v1, s4 1834; SI-NEXT: v_mul_hi_u32 v1, s10, v1 1835; SI-NEXT: s_mul_i32 s7, s8, s7 1836; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 1837; SI-NEXT: s_mul_i32 s7, s10, s5 1838; SI-NEXT: s_mul_i32 s12, s9, s6 1839; SI-NEXT: s_mul_i32 s6, s8, s6 1840; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 1841; SI-NEXT: s_mul_i32 s7, s11, s4 1842; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 1843; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 1844; SI-NEXT: s_mul_i32 s7, s10, s4 1845; SI-NEXT: v_mov_b32_e32 v2, s6 1846; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 1847; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc 1848; SI-NEXT: v_mov_b32_e32 v1, s8 1849; SI-NEXT: v_mul_hi_u32 v5, s4, v1 1850; SI-NEXT: v_mul_hi_u32 v1, s5, v1 1851; SI-NEXT: v_mov_b32_e32 v3, s9 1852; SI-NEXT: 
v_mul_hi_u32 v4, s4, v3 1853; SI-NEXT: s_mul_i32 s7, s5, s8 1854; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 1855; SI-NEXT: s_mul_i32 s6, s4, s9 1856; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 1857; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5 1858; SI-NEXT: v_mul_hi_u32 v3, s5, v3 1859; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 1860; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 1861; SI-NEXT: s_mul_i32 s5, s5, s9 1862; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc 1863; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 1864; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 1865; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1866; SI-NEXT: s_mul_i32 s4, s4, s8 1867; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc 1868; SI-NEXT: v_mov_b32_e32 v0, s4 1869; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1870; SI-NEXT: s_endpgm 1871; 1872; VI-LABEL: s_mul_i128: 1873; VI: ; %bb.0: ; %entry 1874; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c 1875; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c 1876; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1877; VI-NEXT: v_mov_b32_e32 v5, 0 1878; VI-NEXT: s_mov_b32 s3, 0xf000 1879; VI-NEXT: s_waitcnt lgkmcnt(0) 1880; VI-NEXT: v_mov_b32_e32 v0, s6 1881; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0 1882; VI-NEXT: s_mul_i32 s7, s8, s7 1883; VI-NEXT: v_mov_b32_e32 v6, s8 1884; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 1885; VI-NEXT: s_mul_i32 s12, s9, s6 1886; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0 1887; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3 1888; VI-NEXT: v_mov_b32_e32 v4, v1 1889; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5] 1890; VI-NEXT: v_mov_b32_e32 v8, s4 1891; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3] 1892; VI-NEXT: v_mov_b32_e32 v3, v7 1893; VI-NEXT: v_mov_b32_e32 v7, v5 1894; VI-NEXT: v_mov_b32_e32 v8, s9 1895; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7] 1896; VI-NEXT: s_mul_i32 s8, s11, s4 1897; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2 1898; VI-NEXT: v_mov_b32_e32 v2, v5 1899; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 
1900; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc 1901; VI-NEXT: s_mul_i32 s8, s10, s5 1902; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3] 1903; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6 1904; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 1905; VI-NEXT: s_mov_b32 s2, -1 1906; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 1907; VI-NEXT: v_mov_b32_e32 v1, v4 1908; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1909; VI-NEXT: s_endpgm 1910; 1911; GFX9-LABEL: s_mul_i128: 1912; GFX9: ; %bb.0: ; %entry 1913; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c 1914; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c 1915; GFX9-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24 1916; GFX9-NEXT: s_mov_b32 s15, 0xf000 1917; GFX9-NEXT: s_mov_b32 s14, -1 1918; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1919; GFX9-NEXT: s_mul_i32 s0, s8, s7 1920; GFX9-NEXT: s_mul_hi_u32 s1, s8, s6 1921; GFX9-NEXT: s_mul_i32 s2, s10, s5 1922; GFX9-NEXT: s_mul_hi_u32 s3, s10, s4 1923; GFX9-NEXT: s_add_i32 s0, s1, s0 1924; GFX9-NEXT: s_mul_i32 s1, s9, s6 1925; GFX9-NEXT: s_add_i32 s2, s3, s2 1926; GFX9-NEXT: s_mul_i32 s3, s11, s4 1927; GFX9-NEXT: s_add_i32 s0, s0, s1 1928; GFX9-NEXT: s_mul_i32 s1, s8, s6 1929; GFX9-NEXT: s_add_i32 s2, s2, s3 1930; GFX9-NEXT: s_mul_i32 s3, s10, s4 1931; GFX9-NEXT: s_add_u32 s3, s3, s1 1932; GFX9-NEXT: s_addc_u32 s2, s2, s0 1933; GFX9-NEXT: s_mul_i32 s10, s5, s8 1934; GFX9-NEXT: s_mul_hi_u32 s11, s4, s8 1935; GFX9-NEXT: s_mul_hi_u32 s7, s5, s8 1936; GFX9-NEXT: s_add_u32 s10, s10, s11 1937; GFX9-NEXT: s_mul_i32 s1, s4, s9 1938; GFX9-NEXT: s_addc_u32 s7, s7, 0 1939; GFX9-NEXT: s_mul_hi_u32 s6, s4, s9 1940; GFX9-NEXT: s_add_u32 s1, s1, s10 1941; GFX9-NEXT: s_addc_u32 s6, s6, 0 1942; GFX9-NEXT: s_add_u32 s6, s7, s6 1943; GFX9-NEXT: s_addc_u32 s7, 0, 0 1944; GFX9-NEXT: s_mul_hi_u32 s10, s5, s9 1945; GFX9-NEXT: s_mul_i32 s5, s5, s9 1946; GFX9-NEXT: s_add_u32 s5, s5, s6 1947; GFX9-NEXT: s_addc_u32 s6, s10, s7 1948; GFX9-NEXT: s_mov_b32 s0, 0 1949; GFX9-NEXT: s_add_u32 s5, s5, s3 1950; 
GFX9-NEXT: s_addc_u32 s6, s6, s2 1951; GFX9-NEXT: s_mul_i32 s2, s4, s8 1952; GFX9-NEXT: s_mov_b32 s3, s0 1953; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] 1954; GFX9-NEXT: v_mov_b32_e32 v0, s0 1955; GFX9-NEXT: v_mov_b32_e32 v1, s1 1956; GFX9-NEXT: v_mov_b32_e32 v2, s5 1957; GFX9-NEXT: v_mov_b32_e32 v3, s6 1958; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1959; GFX9-NEXT: s_endpgm 1960; 1961; GFX10-LABEL: s_mul_i128: 1962; GFX10: ; %bb.0: ; %entry 1963; GFX10-NEXT: s_clause 0x1 1964; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c 1965; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c 1966; GFX10-NEXT: s_mov_b32 s2, 0 1967; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1968; GFX10-NEXT: s_mov_b32 s13, s2 1969; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1970; GFX10-NEXT: s_mul_i32 s3, s8, s7 1971; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 1972; GFX10-NEXT: s_mul_i32 s14, s10, s5 1973; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 1974; GFX10-NEXT: s_mul_i32 s12, s9, s6 1975; GFX10-NEXT: s_mul_i32 s11, s11, s4 1976; GFX10-NEXT: s_add_i32 s3, s7, s3 1977; GFX10-NEXT: s_add_i32 s7, s15, s14 1978; GFX10-NEXT: s_mul_i32 s6, s8, s6 1979; GFX10-NEXT: s_mul_i32 s10, s10, s4 1980; GFX10-NEXT: s_add_i32 s3, s3, s12 1981; GFX10-NEXT: s_add_i32 s7, s7, s11 1982; GFX10-NEXT: s_mul_i32 s19, s5, s8 1983; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 1984; GFX10-NEXT: s_add_u32 s6, s10, s6 1985; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 1986; GFX10-NEXT: s_addc_u32 s7, s7, s3 1987; GFX10-NEXT: s_mul_i32 s17, s4, s9 1988; GFX10-NEXT: s_add_u32 s3, s19, s20 1989; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 1990; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 1991; GFX10-NEXT: s_mul_i32 s5, s5, s9 1992; GFX10-NEXT: s_addc_u32 s9, s18, 0 1993; GFX10-NEXT: s_add_u32 s3, s17, s3 1994; GFX10-NEXT: s_addc_u32 s10, s16, 0 1995; GFX10-NEXT: s_mul_i32 s12, s4, s8 1996; GFX10-NEXT: s_add_u32 s4, s9, s10 1997; GFX10-NEXT: s_addc_u32 s8, 0, 0 1998; GFX10-NEXT: s_add_u32 s4, s5, s4 1999; GFX10-NEXT: s_addc_u32 s5, s21, s8 2000; 
GFX10-NEXT: s_add_u32 s4, s4, s6 2001; GFX10-NEXT: s_addc_u32 s5, s5, s7 2002; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] 2003; GFX10-NEXT: v_mov_b32_e32 v2, s4 2004; GFX10-NEXT: v_mov_b32_e32 v0, s2 2005; GFX10-NEXT: v_mov_b32_e32 v1, s3 2006; GFX10-NEXT: v_mov_b32_e32 v3, s5 2007; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2008; GFX10-NEXT: s_mov_b32 s2, -1 2009; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2010; GFX10-NEXT: s_endpgm 2011; 2012; GFX11-LABEL: s_mul_i128: 2013; GFX11: ; %bb.0: ; %entry 2014; GFX11-NEXT: s_clause 0x2 2015; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c 2016; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c 2017; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2018; GFX11-NEXT: s_mov_b32 s2, 0 2019; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2020; GFX11-NEXT: s_mov_b32 s13, s2 2021; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2022; GFX11-NEXT: s_mul_i32 s3, s8, s7 2023; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 2024; GFX11-NEXT: s_mul_i32 s14, s10, s5 2025; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 2026; GFX11-NEXT: s_mul_i32 s12, s9, s6 2027; GFX11-NEXT: s_mul_i32 s11, s11, s4 2028; GFX11-NEXT: s_add_i32 s3, s7, s3 2029; GFX11-NEXT: s_add_i32 s7, s15, s14 2030; GFX11-NEXT: s_mul_i32 s6, s8, s6 2031; GFX11-NEXT: s_mul_i32 s10, s10, s4 2032; GFX11-NEXT: s_add_i32 s3, s3, s12 2033; GFX11-NEXT: s_add_i32 s7, s7, s11 2034; GFX11-NEXT: s_mul_i32 s19, s5, s8 2035; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 2036; GFX11-NEXT: s_add_u32 s6, s10, s6 2037; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 2038; GFX11-NEXT: s_addc_u32 s7, s7, s3 2039; GFX11-NEXT: s_mul_i32 s17, s4, s9 2040; GFX11-NEXT: s_add_u32 s3, s19, s20 2041; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 2042; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 2043; GFX11-NEXT: s_mul_i32 s5, s5, s9 2044; GFX11-NEXT: s_addc_u32 s9, s18, 0 2045; GFX11-NEXT: s_add_u32 s3, s17, s3 2046; GFX11-NEXT: s_addc_u32 s10, s16, 0 2047; GFX11-NEXT: s_mul_i32 s12, s4, s8 2048; GFX11-NEXT: s_add_u32 s4, s9, s10 2049; GFX11-NEXT: s_addc_u32 s8, 0, 0 
2050; GFX11-NEXT: s_add_u32 s4, s5, s4 2051; GFX11-NEXT: s_addc_u32 s5, s21, s8 2052; GFX11-NEXT: s_add_u32 s4, s4, s6 2053; GFX11-NEXT: s_addc_u32 s5, s5, s7 2054; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] 2055; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2056; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 2057; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 2058; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2059; GFX11-NEXT: s_mov_b32 s2, -1 2060; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 2061; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2062; GFX11-NEXT: s_endpgm 2063; 2064; EG-LABEL: s_mul_i128: 2065; EG: ; %bb.0: ; %entry 2066; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[] 2067; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2068; EG-NEXT: CF_END 2069; EG-NEXT: PAD 2070; EG-NEXT: ALU clause starting at 4: 2071; EG-NEXT: MULLO_INT * T0.X, KC0[5].X, KC0[8].X, 2072; EG-NEXT: MULHI * T0.Y, KC0[5].X, KC0[8].X, 2073; EG-NEXT: MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W, 2074; EG-NEXT: MULLO_INT * T0.W, KC0[8].X, KC0[5].Y, 2075; EG-NEXT: MULHI * T1.X, KC0[5].X, KC0[7].W, 2076; EG-NEXT: MULHI * T1.Y, KC0[4].W, KC0[8].X, 2077; EG-NEXT: MULHI * T1.Z, KC0[8].Y, KC0[4].W, 2078; EG-NEXT: MULLO_INT * T1.W, KC0[8].Y, KC0[5].X, 2079; EG-NEXT: MULHI * T2.X, KC0[7].W, KC0[5].Y, 2080; EG-NEXT: MULLO_INT * T2.Y, KC0[5].X, KC0[7].W, 2081; EG-NEXT: MULHI * T2.Z, KC0[4].W, KC0[7].W, 2082; EG-NEXT: ADD_INT T2.W, T2.Y, PS, 2083; EG-NEXT: MULLO_INT * T3.X, KC0[4].W, KC0[8].X, 2084; EG-NEXT: ADDC_UINT T2.Z, T2.Y, T2.Z, 2085; EG-NEXT: ADDC_UINT T3.W, PS, PV.W, 2086; EG-NEXT: MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z, 2087; EG-NEXT: ADD_INT T2.X, T2.X, PS, 2088; EG-NEXT: ADD_INT T2.Y, T1.Z, T1.W, 2089; EG-NEXT: ADD_INT T1.Z, T1.Y, PV.W, 2090; EG-NEXT: ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212 2091; EG-NEXT: MULLO_INT * T1.X, KC0[8].Z, KC0[4].W, 2092; EG-NEXT: ADD_INT T4.X, PV.W, PV.Z, 2093; EG-NEXT: ADDC_UINT T1.Y, PV.W, PV.Z, 2094; EG-NEXT: ADD_INT T1.Z, 
PV.Y, PS, 2095; EG-NEXT: ADD_INT T0.W, PV.X, T0.W, 2096; EG-NEXT: MULLO_INT * T1.X, KC0[7].W, KC0[5].Y, 2097; EG-NEXT: ADD_INT T2.Y, PV.Z, PV.W, 2098; EG-NEXT: ADDC_UINT T1.Z, T0.Z, PS, 2099; EG-NEXT: ADD_INT T0.W, T0.Y, PV.Y, 2100; EG-NEXT: ADDC_UINT * T1.W, T0.X, PV.X, 2101; EG-NEXT: ADD_INT T0.Y, T0.X, T4.X, 2102; EG-NEXT: ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122 2103; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2104; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z, 2105; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2106; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z, 2107; EG-NEXT: ADD_INT * T0.W, PV.W, PS, 2108; EG-NEXT: ADD_INT * T0.Z, T0.Y, T0.Z, 2109; EG-NEXT: ADD_INT * T0.Y, T3.X, T2.W, 2110; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2111; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2112; EG-NEXT: MULLO_INT * T0.X, KC0[4].W, KC0[7].W, 2113entry: 2114 %mul = mul i128 %a, %b 2115 store i128 %mul, ptr addrspace(1) %out 2116 ret void 2117} 2118; v_mul_i128 is the per-work-item i128 multiply. %tid from llvm.amdgcn.workitem.id.x indexes %aptr and %bptr, and the two loaded i128 values are multiplied. NOTE(review) - %gep.out is built from %bptr rather than %out, so the product is stored back over the %bptr element and %out is never used; the autogenerated assertions encode this behaviour, so changing it requires regenerating them with utils/update_llc_test_checks.py. 2119define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { 2120; SI-LABEL: v_mul_i128: 2121; SI: ; %bb.0: ; %entry 2122; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2123; SI-NEXT: s_mov_b32 s7, 0xf000 2124; SI-NEXT: s_mov_b32 s6, 0 2125; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 2126; SI-NEXT: v_mov_b32_e32 v9, 0 2127; SI-NEXT: s_waitcnt lgkmcnt(0) 2128; SI-NEXT: s_mov_b64 s[4:5], s[0:1] 2129; SI-NEXT: s_mov_b64 s[0:1], s[2:3] 2130; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 2131; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 2132; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 2133; SI-NEXT: s_waitcnt vmcnt(0) 2134; SI-NEXT: v_mul_lo_u32 v3, v4, v3 2135; SI-NEXT: v_mul_hi_u32 v10, v4, v2 2136; SI-NEXT: v_mul_lo_u32 v12, v6, v1 2137; SI-NEXT: v_mul_hi_u32 v13, v6, v0 2138; SI-NEXT: v_mul_lo_u32 v17, v1, v4 2139; SI-NEXT: v_mul_hi_u32 v18, v0, v4 2140; SI-NEXT: v_mul_lo_u32 v11, v5, v2 2141; SI-NEXT: v_mul_lo_u32 v7, v7, v0 2142; SI-NEXT: v_mul_hi_u32 v16, v1, v4 2143; SI-NEXT: 
v_mul_lo_u32 v15, v0, v5 2144; SI-NEXT: v_mul_hi_u32 v14, v0, v5 2145; SI-NEXT: v_mul_hi_u32 v19, v1, v5 2146; SI-NEXT: v_mul_lo_u32 v5, v1, v5 2147; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3 2148; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12 2149; SI-NEXT: v_mul_lo_u32 v2, v4, v2 2150; SI-NEXT: v_mul_lo_u32 v6, v6, v0 2151; SI-NEXT: v_mul_lo_u32 v0, v0, v4 2152; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18 2153; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc 2154; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11 2155; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 2156; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4 2157; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc 2158; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2 2159; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc 2160; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4 2161; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc 2162; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2163; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc 2164; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 2165; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 2166; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64 2167; SI-NEXT: s_endpgm 2168; 2169; VI-LABEL: v_mul_i128: 2170; VI: ; %bb.0: ; %entry 2171; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 2172; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 2173; VI-NEXT: v_mov_b32_e32 v11, 0 2174; VI-NEXT: s_waitcnt lgkmcnt(0) 2175; VI-NEXT: v_mov_b32_e32 v1, s1 2176; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2177; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2178; VI-NEXT: v_mov_b32_e32 v3, s3 2179; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2 2180; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc 2181; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2182; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9] 2183; VI-NEXT: s_waitcnt vmcnt(0) 2184; VI-NEXT: v_mul_lo_u32 v10, v4, v3 2185; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0 2186; VI-NEXT: v_mul_lo_u32 v14, v5, v2 2187; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 2188; VI-NEXT: v_mul_lo_u32 v15, v7, v0 2189; VI-NEXT: 
v_add_u32_e32 v7, vcc, v13, v10 2190; VI-NEXT: v_mov_b32_e32 v10, v3 2191; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] 2192; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v14 2193; VI-NEXT: v_mov_b32_e32 v7, v4 2194; VI-NEXT: v_mov_b32_e32 v4, v11 2195; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13] 2196; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] 2197; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v13 2198; VI-NEXT: v_mov_b32_e32 v0, v4 2199; VI-NEXT: v_mul_lo_u32 v10, v6, v1 2200; VI-NEXT: v_add_u32_e32 v6, vcc, v7, v0 2201; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc 2202; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] 2203; VI-NEXT: v_add_u32_e32 v5, vcc, v10, v11 2204; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12 2205; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc 2206; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] 2207; VI-NEXT: s_endpgm 2208; 2209; GFX9-LABEL: v_mul_i128: 2210; GFX9: ; %bb.0: ; %entry 2211; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 2212; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 2213; GFX9-NEXT: v_mov_b32_e32 v10, 0 2214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2215; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] 2216; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] 2217; GFX9-NEXT: s_waitcnt vmcnt(0) 2218; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 2219; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 2220; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 2221; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] 2222; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 2223; GFX9-NEXT: v_mov_b32_e32 v4, v12 2224; GFX9-NEXT: v_mov_b32_e32 v12, v10 2225; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12] 2226; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 2227; GFX9-NEXT: v_mul_lo_u32 v17, v7, v0 2228; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3] 2229; GFX9-NEXT: v_mov_b32_e32 v0, v10 2230; GFX9-NEXT: v_mul_lo_u32 v16, v6, v1 2231; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v0 2232; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 
0, 0, vcc 2233; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] 2234; GFX9-NEXT: v_add3_u32 v3, v17, v3, v16 2235; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 2236; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc 2237; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] 2238; GFX9-NEXT: s_endpgm 2239; 2240; GFX10-LABEL: v_mul_i128: 2241; GFX10: ; %bb.0: ; %entry 2242; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 2243; GFX10-NEXT: v_lshlrev_b32_e32 v14, 4, v0 2244; GFX10-NEXT: v_mov_b32_e32 v10, 0 2245; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2246; GFX10-NEXT: s_clause 0x1 2247; GFX10-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] 2248; GFX10-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] 2249; GFX10-NEXT: s_waitcnt vmcnt(0) 2250; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0 2251; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0 2252; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10] 2253; GFX10-NEXT: v_mov_b32_e32 v9, v12 2254; GFX10-NEXT: v_mov_b32_e32 v12, v10 2255; GFX10-NEXT: v_mul_lo_u32 v10, v5, v2 2256; GFX10-NEXT: v_mad_u64_u32 v[12:13], s0, v0, v5, v[11:12] 2257; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3 2258; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0 2259; GFX10-NEXT: v_mov_b32_e32 v4, v13 2260; GFX10-NEXT: v_mul_lo_u32 v13, v6, v1 2261; GFX10-NEXT: v_add3_u32 v3, v3, v11, v10 2262; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v4 2263; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, 0, s0 2264; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3] 2265; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[9:10] 2266; GFX10-NEXT: v_mov_b32_e32 v9, v12 2267; GFX10-NEXT: v_add3_u32 v3, v7, v3, v13 2268; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 2269; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo 2270; GFX10-NEXT: global_store_dwordx4 v14, v[8:11], s[2:3] 2271; GFX10-NEXT: s_endpgm 2272; 2273; GFX11-LABEL: v_mul_i128: 2274; GFX11: ; %bb.0: ; %entry 2275; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c 2276; GFX11-NEXT: v_lshlrev_b32_e32 v16, 4, v0 2277; 
GFX11-NEXT: v_mov_b32_e32 v10, 0 2278; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2279; GFX11-NEXT: s_clause 0x1 2280; GFX11-NEXT: global_load_b128 v[0:3], v16, s[0:1] 2281; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] 2282; GFX11-NEXT: s_waitcnt vmcnt(0) 2283; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 2284; GFX11-NEXT: v_mul_lo_u32 v15, v5, v2 2285; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 2286; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2287; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] 2288; GFX11-NEXT: v_dual_mov_b32 v9, v12 :: v_dual_mov_b32 v12, v10 2289; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 2290; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v0, v5, v[11:12] 2291; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v4, v2, 0 2292; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 2293; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 2294; GFX11-NEXT: v_mov_b32_e32 v2, v14 2295; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 2296; GFX11-NEXT: v_add3_u32 v11, v11, v3, v15 2297; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 2298; GFX11-NEXT: v_mov_b32_e32 v9, v13 2299; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 2300; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 2301; GFX11-NEXT: v_mad_u64_u32 v[14:15], null, v6, v0, v[10:11] 2302; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] 2303; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2304; GFX11-NEXT: v_add3_u32 v0, v12, v15, v4 2305; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v14 2306; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2307; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo 2308; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3] 2309; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2310; GFX11-NEXT: s_endpgm 2311; 2312; EG-LABEL: v_mul_i128: 2313; EG: ; %bb.0: ; %entry 2314; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] 
2315; EG-NEXT: TEX 1 @6 2316; EG-NEXT: ALU 41, @14, KC0[], KC1[] 2317; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2318; EG-NEXT: CF_END 2319; EG-NEXT: PAD 2320; EG-NEXT: Fetch clause starting at 6: 2321; EG-NEXT: VTX_READ_128 T2.XYZW, T1.X, 0, #1 2322; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2323; EG-NEXT: ALU clause starting at 10: 2324; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 2325; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 2326; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 2327; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, 2328; EG-NEXT: ALU clause starting at 14: 2329; EG-NEXT: MULLO_INT * T1.Y, T0.Y, T2.Y, 2330; EG-NEXT: MULHI * T1.Z, T0.Y, T2.Y, 2331; EG-NEXT: MULLO_INT * T1.W, T2.Z, T0.X, 2332; EG-NEXT: MULLO_INT * T3.X, T2.Y, T0.Z, 2333; EG-NEXT: MULHI * T3.Y, T0.Y, T2.X, 2334; EG-NEXT: MULHI * T3.Z, T0.X, T2.Y, 2335; EG-NEXT: MULHI * T3.W, T2.Z, T0.X, 2336; EG-NEXT: MULLO_INT * T2.Z, T2.Z, T0.Y, 2337; EG-NEXT: MULHI * T4.X, T2.X, T0.Z, 2338; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T2.X, 2339; EG-NEXT: MULHI * T4.Y, T0.X, T2.X, 2340; EG-NEXT: ADD_INT T4.W, T0.Y, PS, 2341; EG-NEXT: MULLO_INT * T2.Y, T0.X, T2.Y, 2342; EG-NEXT: ADDC_UINT T4.Z, T0.Y, T4.Y, 2343; EG-NEXT: ADDC_UINT T5.W, PS, PV.W, 2344; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.W, 2345; EG-NEXT: ADD_INT T4.X, T4.X, PS, 2346; EG-NEXT: ADD_INT T0.Y, T3.W, T2.Z, 2347; EG-NEXT: ADD_INT T2.Z, T3.Z, PV.W, 2348; EG-NEXT: ADD_INT T0.W, T3.Y, PV.Z, 2349; EG-NEXT: MULLO_INT * T2.W, T2.W, T0.X, 2350; EG-NEXT: ADD_INT T5.X, PV.W, PV.Z, 2351; EG-NEXT: ADDC_UINT T3.Y, PV.W, PV.Z, 2352; EG-NEXT: ADD_INT T2.Z, PV.Y, PS, 2353; EG-NEXT: ADD_INT T0.W, PV.X, T3.X, 2354; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.Z, 2355; EG-NEXT: ADD_INT T4.Y, PV.Z, PV.W, 2356; EG-NEXT: ADDC_UINT T0.Z, T1.W, PS, 2357; EG-NEXT: ADD_INT T0.W, T1.Z, PV.Y, 2358; EG-NEXT: ADDC_UINT * T2.W, T1.Y, PV.X, 2359; EG-NEXT: ADD_INT T1.Y, T1.Y, T5.X, 2360; EG-NEXT: ADD_INT T1.Z, T1.W, T0.Y, 2361; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2362; EG-NEXT: ADD_INT * T1.W, 
PV.Y, PV.Z, 2363; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2364; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z, 2365; EG-NEXT: ADD_INT * T0.W, PV.W, PS, 2366; EG-NEXT: ADD_INT * T0.Z, T1.Y, T1.Z, 2367; EG-NEXT: ADD_INT * T0.Y, T2.Y, T4.W, 2368; EG-NEXT: LSHR T1.X, T1.X, literal.x, 2369; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.X, 2370; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2371entry: 2372 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2373 %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid 2374 %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid 2375 %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid 2376 %a = load i128, ptr addrspace(1) %gep.a 2377 %b = load i128, ptr addrspace(1) %gep.b 2378 %mul = mul i128 %a, %b 2379 store i128 %mul, ptr addrspace(1) %gep.out 2380 ret void 2381} 2382; Intrinsic called by v_mul_i128 to obtain the index used in its getelementptrs; attribute group #0 is referenced by the s_mul_i128 and v_mul_i128 kernels, group #1 by this declaration. 2383declare i32 @llvm.amdgcn.workitem.id.x() #1 2384 2385attributes #0 = { nounwind } 2386attributes #1 = { nounwind readnone} 2387