1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s 6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s 7; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s 8; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s 9 10; mul24 and mad24 are affected 11 12define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 13; SI-LABEL: test_mul_v2i32: 14; SI: ; %bb.0: ; %entry 15; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 16; SI-NEXT: s_mov_b32 s7, 0xf000 17; SI-NEXT: s_mov_b32 s6, -1 18; SI-NEXT: s_mov_b32 s10, s6 19; SI-NEXT: s_mov_b32 s11, s7 20; SI-NEXT: s_waitcnt lgkmcnt(0) 21; SI-NEXT: s_mov_b32 s8, s2 22; SI-NEXT: s_mov_b32 s9, s3 23; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 24; SI-NEXT: s_mov_b32 s4, s0 25; SI-NEXT: s_mov_b32 s5, s1 26; SI-NEXT: s_waitcnt vmcnt(0) 27; SI-NEXT: v_mul_lo_u32 v1, v1, v3 28; SI-NEXT: v_mul_lo_u32 v0, v0, v2 29; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 30; SI-NEXT: s_endpgm 31; 32; VI-LABEL: test_mul_v2i32: 33; VI: ; %bb.0: ; %entry 34; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 35; VI-NEXT: s_mov_b32 s7, 0xf000 36; VI-NEXT: s_mov_b32 s6, -1 37; VI-NEXT: s_mov_b32 s10, s6 38; VI-NEXT: s_mov_b32 s11, s7 39; VI-NEXT: s_waitcnt lgkmcnt(0) 40; VI-NEXT: s_mov_b32 s8, s2 41; VI-NEXT: s_mov_b32 s9, s3 42; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 43; VI-NEXT: s_mov_b32 s4, s0 44; VI-NEXT: s_mov_b32 s5, s1 45; VI-NEXT: s_waitcnt vmcnt(0) 46; VI-NEXT: v_mul_lo_u32 v1, v1, v3 47; VI-NEXT: v_mul_lo_u32 v0, v0, v2 48; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 49; VI-NEXT: s_endpgm 50; 51; GFX9-LABEL: test_mul_v2i32: 52; GFX9: ; %bb.0: ; %entry 53; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 54; GFX9-NEXT: s_mov_b32 s7, 0xf000 55; GFX9-NEXT: s_mov_b32 s6, -1 56; GFX9-NEXT: s_mov_b32 s10, s6 57; GFX9-NEXT: s_mov_b32 s11, s7 58; GFX9-NEXT: s_waitcnt lgkmcnt(0) 59; GFX9-NEXT: s_mov_b32 s8, s2 60; GFX9-NEXT: s_mov_b32 s9, s3 61; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 62; GFX9-NEXT: s_mov_b32 s4, s0 63; GFX9-NEXT: s_mov_b32 s5, s1 64; GFX9-NEXT: s_waitcnt vmcnt(0) 65; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 66; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 67; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 68; GFX9-NEXT: s_endpgm 69; 70; GFX10-LABEL: test_mul_v2i32: 71; GFX10: ; %bb.0: ; %entry 72; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 73; GFX10-NEXT: s_mov_b32 s6, -1 74; GFX10-NEXT: s_mov_b32 s7, 0x31016000 75; GFX10-NEXT: s_mov_b32 s10, s6 76; GFX10-NEXT: s_mov_b32 s11, s7 77; GFX10-NEXT: s_waitcnt lgkmcnt(0) 78; GFX10-NEXT: s_mov_b32 s8, s2 79; GFX10-NEXT: s_mov_b32 s9, s3 80; GFX10-NEXT: s_mov_b32 s4, s0 81; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 82; GFX10-NEXT: s_mov_b32 s5, s1 83; GFX10-NEXT: s_waitcnt vmcnt(0) 84; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 85; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 86; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 87; GFX10-NEXT: s_endpgm 88; 89; GFX11-LABEL: test_mul_v2i32: 90; GFX11: ; %bb.0: ; %entry 91; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 92; GFX11-NEXT: s_mov_b32 s6, -1 93; GFX11-NEXT: s_mov_b32 s7, 0x31016000 94; GFX11-NEXT: s_mov_b32 s10, s6 95; GFX11-NEXT: s_mov_b32 s11, s7 96; GFX11-NEXT: s_waitcnt lgkmcnt(0) 97; GFX11-NEXT: s_mov_b32 s8, s2 98; GFX11-NEXT: s_mov_b32 s9, s3 99; GFX11-NEXT: s_mov_b32 s4, s0 100; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 101; GFX11-NEXT: s_mov_b32 s5, s1 102; GFX11-NEXT: s_waitcnt vmcnt(0) 103; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3 104; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2 105; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 106; GFX11-NEXT: s_nop 0 107; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 108; GFX11-NEXT: s_endpgm 109; 110; GFX12-LABEL: test_mul_v2i32: 111; GFX12: ; %bb.0: ; %entry 112; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 113; GFX12-NEXT: s_mov_b32 s6, -1 114; GFX12-NEXT: s_mov_b32 s7, 0x31016000 115; GFX12-NEXT: s_mov_b32 s10, s6 116; GFX12-NEXT: s_mov_b32 s11, s7 117; GFX12-NEXT: s_wait_kmcnt 0x0 118; GFX12-NEXT: s_mov_b32 s8, s2 119; GFX12-NEXT: s_mov_b32 s9, s3 120; GFX12-NEXT: s_mov_b32 s4, s0 121; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null 122; GFX12-NEXT: s_mov_b32 s5, s1 123; GFX12-NEXT: s_wait_loadcnt 0x0 124; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 125; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 126; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 127; GFX12-NEXT: s_nop 0 128; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 129; GFX12-NEXT: s_endpgm 130; 131; EG-LABEL: test_mul_v2i32: 132; EG: ; %bb.0: ; %entry 133; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 134; EG-NEXT: TEX 0 @6 135; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 136; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 137; EG-NEXT: CF_END 138; EG-NEXT: PAD 139; EG-NEXT: Fetch clause starting at 6: 140; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 141; EG-NEXT: ALU clause starting at 8: 142; EG-NEXT: MOV * T0.X, KC0[2].Z, 143; EG-NEXT: ALU clause starting at 9: 144; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.W, 145; EG-NEXT: 
LSHR T1.X, KC0[2].Y, literal.x, 146; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Z, 147; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 148entry: 149 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 150 %a = load <2 x i32>, ptr addrspace(1) %in 151 %b = load <2 x i32>, ptr addrspace(1) %b_ptr 152 %result = mul <2 x i32> %a, %b 153 store <2 x i32> %result, ptr addrspace(1) %out 154 ret void 155} 156 157define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 158; SI-LABEL: v_mul_v4i32: 159; SI: ; %bb.0: ; %entry 160; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 161; SI-NEXT: s_mov_b32 s7, 0xf000 162; SI-NEXT: s_mov_b32 s6, -1 163; SI-NEXT: s_mov_b32 s10, s6 164; SI-NEXT: s_mov_b32 s11, s7 165; SI-NEXT: s_waitcnt lgkmcnt(0) 166; SI-NEXT: s_mov_b32 s8, s2 167; SI-NEXT: s_mov_b32 s9, s3 168; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 169; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 170; SI-NEXT: s_mov_b32 s4, s0 171; SI-NEXT: s_mov_b32 s5, s1 172; SI-NEXT: s_waitcnt vmcnt(0) 173; SI-NEXT: v_mul_lo_u32 v3, v3, v7 174; SI-NEXT: v_mul_lo_u32 v2, v2, v6 175; SI-NEXT: v_mul_lo_u32 v1, v1, v5 176; SI-NEXT: v_mul_lo_u32 v0, v0, v4 177; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 178; SI-NEXT: s_endpgm 179; 180; VI-LABEL: v_mul_v4i32: 181; VI: ; %bb.0: ; %entry 182; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 183; VI-NEXT: s_mov_b32 s7, 0xf000 184; VI-NEXT: s_mov_b32 s6, -1 185; VI-NEXT: s_mov_b32 s10, s6 186; VI-NEXT: s_mov_b32 s11, s7 187; VI-NEXT: s_waitcnt lgkmcnt(0) 188; VI-NEXT: s_mov_b32 s8, s2 189; VI-NEXT: s_mov_b32 s9, s3 190; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 191; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 192; VI-NEXT: s_mov_b32 s4, s0 193; VI-NEXT: s_mov_b32 s5, s1 194; VI-NEXT: s_waitcnt vmcnt(0) 195; VI-NEXT: v_mul_lo_u32 v3, v3, v7 196; VI-NEXT: v_mul_lo_u32 v2, v2, v6 197; VI-NEXT: v_mul_lo_u32 v1, v1, v5 198; VI-NEXT: v_mul_lo_u32 v0, v0, v4 199; VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 200; VI-NEXT: s_endpgm 201; 202; GFX9-LABEL: v_mul_v4i32: 203; GFX9: ; %bb.0: ; %entry 204; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 205; GFX9-NEXT: s_mov_b32 s7, 0xf000 206; GFX9-NEXT: s_mov_b32 s6, -1 207; GFX9-NEXT: s_mov_b32 s10, s6 208; GFX9-NEXT: s_mov_b32 s11, s7 209; GFX9-NEXT: s_waitcnt lgkmcnt(0) 210; GFX9-NEXT: s_mov_b32 s8, s2 211; GFX9-NEXT: s_mov_b32 s9, s3 212; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 213; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 214; GFX9-NEXT: s_mov_b32 s4, s0 215; GFX9-NEXT: s_mov_b32 s5, s1 216; GFX9-NEXT: s_waitcnt vmcnt(0) 217; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7 218; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6 219; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5 220; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 221; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 222; GFX9-NEXT: s_endpgm 223; 224; GFX10-LABEL: v_mul_v4i32: 225; GFX10: ; %bb.0: ; %entry 226; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 227; GFX10-NEXT: s_mov_b32 s6, -1 228; GFX10-NEXT: s_mov_b32 s7, 0x31016000 229; GFX10-NEXT: s_mov_b32 s10, s6 230; GFX10-NEXT: s_mov_b32 s11, s7 231; GFX10-NEXT: s_waitcnt lgkmcnt(0) 232; GFX10-NEXT: s_mov_b32 s8, s2 233; GFX10-NEXT: s_mov_b32 s9, s3 234; GFX10-NEXT: s_clause 0x1 235; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 236; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 237; GFX10-NEXT: s_mov_b32 s4, s0 238; GFX10-NEXT: s_mov_b32 s5, s1 239; GFX10-NEXT: s_waitcnt vmcnt(0) 240; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7 241; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6 242; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5 243; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 244; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 245; GFX10-NEXT: s_endpgm 246; 247; GFX11-LABEL: v_mul_v4i32: 248; GFX11: ; %bb.0: ; %entry 249; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 250; GFX11-NEXT: s_mov_b32 s6, -1 251; GFX11-NEXT: s_mov_b32 s7, 0x31016000 252; GFX11-NEXT: 
s_mov_b32 s10, s6 253; GFX11-NEXT: s_mov_b32 s11, s7 254; GFX11-NEXT: s_waitcnt lgkmcnt(0) 255; GFX11-NEXT: s_mov_b32 s8, s2 256; GFX11-NEXT: s_mov_b32 s9, s3 257; GFX11-NEXT: s_clause 0x1 258; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 259; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16 260; GFX11-NEXT: s_mov_b32 s4, s0 261; GFX11-NEXT: s_mov_b32 s5, s1 262; GFX11-NEXT: s_waitcnt vmcnt(0) 263; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7 264; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6 265; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5 266; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4 267; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 268; GFX11-NEXT: s_nop 0 269; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 270; GFX11-NEXT: s_endpgm 271; 272; GFX12-LABEL: v_mul_v4i32: 273; GFX12: ; %bb.0: ; %entry 274; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 275; GFX12-NEXT: s_mov_b32 s6, -1 276; GFX12-NEXT: s_mov_b32 s7, 0x31016000 277; GFX12-NEXT: s_mov_b32 s10, s6 278; GFX12-NEXT: s_mov_b32 s11, s7 279; GFX12-NEXT: s_wait_kmcnt 0x0 280; GFX12-NEXT: s_mov_b32 s8, s2 281; GFX12-NEXT: s_mov_b32 s9, s3 282; GFX12-NEXT: s_clause 0x1 283; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null 284; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 285; GFX12-NEXT: s_mov_b32 s4, s0 286; GFX12-NEXT: s_mov_b32 s5, s1 287; GFX12-NEXT: s_wait_loadcnt 0x0 288; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7 289; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6 290; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5 291; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4 292; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null 293; GFX12-NEXT: s_nop 0 294; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 295; GFX12-NEXT: s_endpgm 296; 297; EG-LABEL: v_mul_v4i32: 298; EG: ; %bb.0: ; %entry 299; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 300; EG-NEXT: TEX 1 @6 301; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 302; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 303; EG-NEXT: CF_END 304; EG-NEXT: PAD 305; EG-NEXT: 
Fetch clause starting at 6: 306; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 307; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 308; EG-NEXT: ALU clause starting at 10: 309; EG-NEXT: MOV * T0.X, KC0[2].Z, 310; EG-NEXT: ALU clause starting at 11: 311; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.W, 312; EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z, 313; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.Y, 314; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 315; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 316; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 317entry: 318 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 319 %a = load <4 x i32>, ptr addrspace(1) %in 320 %b = load <4 x i32>, ptr addrspace(1) %b_ptr 321 %result = mul <4 x i32> %a, %b 322 store <4 x i32> %result, ptr addrspace(1) %out 323 ret void 324} 325 326define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { 327; SI-LABEL: s_trunc_i64_mul_to_i32: 328; SI: ; %bb.0: ; %entry 329; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 330; SI-NEXT: s_waitcnt lgkmcnt(0) 331; SI-NEXT: s_load_dword s7, s[0:1], 0xd 332; SI-NEXT: s_mov_b32 s3, 0xf000 333; SI-NEXT: s_mov_b32 s2, -1 334; SI-NEXT: s_mov_b32 s0, s4 335; SI-NEXT: s_waitcnt lgkmcnt(0) 336; SI-NEXT: s_mul_i32 s4, s7, s6 337; SI-NEXT: s_mov_b32 s1, s5 338; SI-NEXT: v_mov_b32_e32 v0, s4 339; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 340; SI-NEXT: s_endpgm 341; 342; VI-LABEL: s_trunc_i64_mul_to_i32: 343; VI: ; %bb.0: ; %entry 344; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 345; VI-NEXT: s_waitcnt lgkmcnt(0) 346; VI-NEXT: s_load_dword s7, s[0:1], 0x34 347; VI-NEXT: s_mov_b32 s3, 0xf000 348; VI-NEXT: s_mov_b32 s2, -1 349; VI-NEXT: s_mov_b32 s0, s4 350; VI-NEXT: s_waitcnt lgkmcnt(0) 351; VI-NEXT: s_mul_i32 s4, s7, s6 352; VI-NEXT: s_mov_b32 s1, s5 353; VI-NEXT: v_mov_b32_e32 v0, s4 354; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 355; VI-NEXT: s_endpgm 356; 357; GFX9-LABEL: s_trunc_i64_mul_to_i32: 358; GFX9: ; %bb.0: ; %entry 359; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 360; GFX9-NEXT: s_waitcnt lgkmcnt(0) 361; GFX9-NEXT: s_load_dword s7, s[0:1], 0x34 362; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 363; GFX9-NEXT: s_mov_b32 s3, 0xf000 364; GFX9-NEXT: s_mov_b32 s2, -1 365; GFX9-NEXT: s_mov_b32 s0, s4 366; GFX9-NEXT: s_waitcnt lgkmcnt(0) 367; GFX9-NEXT: s_mul_i32 s4, s7, s6 368; GFX9-NEXT: s_mov_b32 s1, s5 369; GFX9-NEXT: v_mov_b32_e32 v0, s4 370; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 371; GFX9-NEXT: s_endpgm 372; 373; GFX10-LABEL: s_trunc_i64_mul_to_i32: 374; GFX10: ; %bb.0: ; %entry 375; GFX10-NEXT: s_clause 0x1 376; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 377; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 378; GFX10-NEXT: s_waitcnt lgkmcnt(0) 379; GFX10-NEXT: s_mov_b32 s7, 0x31016000 380; GFX10-NEXT: s_mul_i32 s0, s2, s6 381; GFX10-NEXT: s_mov_b32 s6, -1 382; GFX10-NEXT: v_mov_b32_e32 v0, s0 383; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 384; GFX10-NEXT: s_endpgm 385; 386; GFX11-LABEL: s_trunc_i64_mul_to_i32: 387; GFX11: ; %bb.0: ; %entry 388; GFX11-NEXT: s_clause 0x1 389; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 390; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 391; GFX11-NEXT: s_waitcnt lgkmcnt(0) 392; GFX11-NEXT: s_mov_b32 s7, 0x31016000 393; GFX11-NEXT: s_mul_i32 s0, s0, s6 394; GFX11-NEXT: s_mov_b32 s6, -1 395; GFX11-NEXT: v_mov_b32_e32 v0, s0 396; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 397; GFX11-NEXT: s_nop 0 398; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 399; GFX11-NEXT: s_endpgm 400; 401; GFX12-LABEL: s_trunc_i64_mul_to_i32: 402; GFX12: ; %bb.0: ; %entry 403; GFX12-NEXT: s_clause 0x1 404; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 405; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x34 406; GFX12-NEXT: s_wait_kmcnt 0x0 407; GFX12-NEXT: s_mov_b32 s7, 0x31016000 408; GFX12-NEXT: s_mul_i32 s0, s0, s6 409; GFX12-NEXT: s_mov_b32 s6, -1 410; GFX12-NEXT: v_mov_b32_e32 v0, s0 411; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null 412; GFX12-NEXT: s_nop 0 413; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) 414; GFX12-NEXT: s_endpgm 415; 416; EG-LABEL: s_trunc_i64_mul_to_i32: 417; EG: ; %bb.0: ; %entry 418; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 419; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 420; EG-NEXT: CF_END 421; EG-NEXT: PAD 422; EG-NEXT: ALU clause starting at 4: 423; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 424; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 425; EG-NEXT: MULLO_INT * T1.X, KC0[3].Y, KC0[2].W, 426entry: 427 %mul = mul i64 %b, %a 428 %trunc = trunc i64 %mul to i32 429 store i32 %trunc, ptr addrspace(1) %out, align 8 430 ret void 431} 432 433define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { 434; SI-LABEL: v_trunc_i64_mul_to_i32: 435; SI: ; %bb.0: ; %entry 436; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 437; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 438; SI-NEXT: s_mov_b32 s3, 0xf000 439; SI-NEXT: s_mov_b32 s2, -1 440; SI-NEXT: s_mov_b32 s14, s2 441; SI-NEXT: s_waitcnt lgkmcnt(0) 442; SI-NEXT: s_mov_b32 s12, s6 443; SI-NEXT: s_mov_b32 s13, s7 444; SI-NEXT: s_mov_b32 s15, s3 445; SI-NEXT: s_mov_b32 s10, s2 446; SI-NEXT: s_mov_b32 s11, s3 447; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 448; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 449; SI-NEXT: s_mov_b32 s0, s4 450; SI-NEXT: s_mov_b32 s1, s5 451; SI-NEXT: s_waitcnt vmcnt(0) 452; SI-NEXT: v_mul_lo_u32 v0, v1, v0 453; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 454; SI-NEXT: s_endpgm 455; 456; VI-LABEL: v_trunc_i64_mul_to_i32: 457; VI: ; %bb.0: ; %entry 458; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 459; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 460; VI-NEXT: s_mov_b32 s3, 0xf000 461; VI-NEXT: s_mov_b32 s2, -1 462; VI-NEXT: s_mov_b32 s14, s2 463; VI-NEXT: s_waitcnt lgkmcnt(0) 464; VI-NEXT: s_mov_b32 s12, s6 465; VI-NEXT: s_mov_b32 s13, s7 466; VI-NEXT: s_mov_b32 s15, s3 467; VI-NEXT: s_mov_b32 s10, s2 468; VI-NEXT: s_mov_b32 s11, s3 469; VI-NEXT: buffer_load_dword 
v0, off, s[12:15], 0 470; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 471; VI-NEXT: s_mov_b32 s0, s4 472; VI-NEXT: s_mov_b32 s1, s5 473; VI-NEXT: s_waitcnt vmcnt(0) 474; VI-NEXT: v_mul_lo_u32 v0, v1, v0 475; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 476; VI-NEXT: s_endpgm 477; 478; GFX9-LABEL: v_trunc_i64_mul_to_i32: 479; GFX9: ; %bb.0: ; %entry 480; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 481; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 482; GFX9-NEXT: s_mov_b32 s3, 0xf000 483; GFX9-NEXT: s_mov_b32 s2, -1 484; GFX9-NEXT: s_mov_b32 s14, s2 485; GFX9-NEXT: s_waitcnt lgkmcnt(0) 486; GFX9-NEXT: s_mov_b32 s12, s6 487; GFX9-NEXT: s_mov_b32 s13, s7 488; GFX9-NEXT: s_mov_b32 s15, s3 489; GFX9-NEXT: s_mov_b32 s10, s2 490; GFX9-NEXT: s_mov_b32 s11, s3 491; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 492; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 493; GFX9-NEXT: s_mov_b32 s0, s4 494; GFX9-NEXT: s_mov_b32 s1, s5 495; GFX9-NEXT: s_waitcnt vmcnt(0) 496; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 497; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 498; GFX9-NEXT: s_endpgm 499; 500; GFX10-LABEL: v_trunc_i64_mul_to_i32: 501; GFX10: ; %bb.0: ; %entry 502; GFX10-NEXT: s_clause 0x1 503; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 504; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 505; GFX10-NEXT: s_mov_b32 s2, -1 506; GFX10-NEXT: s_mov_b32 s3, 0x31016000 507; GFX10-NEXT: s_mov_b32 s14, s2 508; GFX10-NEXT: s_mov_b32 s15, s3 509; GFX10-NEXT: s_mov_b32 s10, s2 510; GFX10-NEXT: s_mov_b32 s11, s3 511; GFX10-NEXT: s_waitcnt lgkmcnt(0) 512; GFX10-NEXT: s_mov_b32 s12, s6 513; GFX10-NEXT: s_mov_b32 s13, s7 514; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 515; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 516; GFX10-NEXT: s_mov_b32 s0, s4 517; GFX10-NEXT: s_mov_b32 s1, s5 518; GFX10-NEXT: s_waitcnt vmcnt(0) 519; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 520; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 521; GFX10-NEXT: s_endpgm 522; 523; GFX11-LABEL: 
v_trunc_i64_mul_to_i32: 524; GFX11: ; %bb.0: ; %entry 525; GFX11-NEXT: s_clause 0x1 526; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 527; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 528; GFX11-NEXT: s_mov_b32 s10, -1 529; GFX11-NEXT: s_mov_b32 s11, 0x31016000 530; GFX11-NEXT: s_mov_b32 s14, s10 531; GFX11-NEXT: s_mov_b32 s15, s11 532; GFX11-NEXT: s_mov_b32 s2, s10 533; GFX11-NEXT: s_mov_b32 s3, s11 534; GFX11-NEXT: s_waitcnt lgkmcnt(0) 535; GFX11-NEXT: s_mov_b32 s12, s6 536; GFX11-NEXT: s_mov_b32 s13, s7 537; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 538; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0 539; GFX11-NEXT: s_mov_b32 s8, s4 540; GFX11-NEXT: s_mov_b32 s9, s5 541; GFX11-NEXT: s_waitcnt vmcnt(0) 542; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0 543; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 544; GFX11-NEXT: s_nop 0 545; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 546; GFX11-NEXT: s_endpgm 547; 548; GFX12-LABEL: v_trunc_i64_mul_to_i32: 549; GFX12: ; %bb.0: ; %entry 550; GFX12-NEXT: s_clause 0x1 551; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 552; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 553; GFX12-NEXT: s_mov_b32 s10, -1 554; GFX12-NEXT: s_mov_b32 s11, 0x31016000 555; GFX12-NEXT: s_mov_b32 s14, s10 556; GFX12-NEXT: s_mov_b32 s15, s11 557; GFX12-NEXT: s_mov_b32 s2, s10 558; GFX12-NEXT: s_mov_b32 s3, s11 559; GFX12-NEXT: s_wait_kmcnt 0x0 560; GFX12-NEXT: s_mov_b32 s12, s6 561; GFX12-NEXT: s_mov_b32 s13, s7 562; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null 563; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null 564; GFX12-NEXT: s_mov_b32 s8, s4 565; GFX12-NEXT: s_mov_b32 s9, s5 566; GFX12-NEXT: s_wait_loadcnt 0x0 567; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0 568; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 569; GFX12-NEXT: s_nop 0 570; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 571; GFX12-NEXT: s_endpgm 572; 573; EG-LABEL: v_trunc_i64_mul_to_i32: 574; EG: ; %bb.0: ; %entry 575; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 576; 
EG-NEXT: TEX 1 @6 577; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] 578; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 579; EG-NEXT: CF_END 580; EG-NEXT: PAD 581; EG-NEXT: Fetch clause starting at 6: 582; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 583; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 584; EG-NEXT: ALU clause starting at 10: 585; EG-NEXT: MOV T0.X, KC0[2].Z, 586; EG-NEXT: MOV * T1.X, KC0[2].W, 587; EG-NEXT: ALU clause starting at 12: 588; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 589; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, 590; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 591entry: 592 %a = load i64, ptr addrspace(1) %aptr, align 8 593 %b = load i64, ptr addrspace(1) %bptr, align 8 594 %mul = mul i64 %b, %a 595 %trunc = trunc i64 %mul to i32 596 store i32 %trunc, ptr addrspace(1) %out, align 8 597 ret void 598} 599 600; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top 601; 32-bits of both arguments are sign bits. 602 603define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { 604; SI-LABEL: mul64_sext_c: 605; SI: ; %bb.0: ; %entry 606; SI-NEXT: s_load_dword s4, s[0:1], 0xb 607; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 608; SI-NEXT: v_mov_b32_e32 v0, 0x50 609; SI-NEXT: s_mov_b32 s3, 0xf000 610; SI-NEXT: s_mov_b32 s2, -1 611; SI-NEXT: s_waitcnt lgkmcnt(0) 612; SI-NEXT: v_mul_hi_i32 v1, s4, v0 613; SI-NEXT: s_mulk_i32 s4, 0x50 614; SI-NEXT: v_mov_b32_e32 v0, s4 615; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 616; SI-NEXT: s_endpgm 617; 618; VI-LABEL: mul64_sext_c: 619; VI: ; %bb.0: ; %entry 620; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 621; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 622; VI-NEXT: v_mov_b32_e32 v0, 0x50 623; VI-NEXT: s_waitcnt lgkmcnt(0) 624; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 625; VI-NEXT: s_mov_b32 s3, 0xf000 626; VI-NEXT: s_mov_b32 s2, -1 627; VI-NEXT: s_nop 2 628; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 629; VI-NEXT: s_endpgm 630; 631; GFX9-LABEL: mul64_sext_c: 632; GFX9: 
; %bb.0: ; %entry 633; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 634; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 635; GFX9-NEXT: s_mov_b32 s7, 0xf000 636; GFX9-NEXT: s_mov_b32 s6, -1 637; GFX9-NEXT: s_waitcnt lgkmcnt(0) 638; GFX9-NEXT: s_mul_hi_i32 s0, s2, 0x50 639; GFX9-NEXT: s_mulk_i32 s2, 0x50 640; GFX9-NEXT: v_mov_b32_e32 v0, s2 641; GFX9-NEXT: v_mov_b32_e32 v1, s0 642; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 643; GFX9-NEXT: s_endpgm 644; 645; GFX10-LABEL: mul64_sext_c: 646; GFX10: ; %bb.0: ; %entry 647; GFX10-NEXT: s_clause 0x1 648; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c 649; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 650; GFX10-NEXT: s_mov_b32 s7, 0x31016000 651; GFX10-NEXT: s_mov_b32 s6, -1 652; GFX10-NEXT: s_waitcnt lgkmcnt(0) 653; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 654; GFX10-NEXT: s_mul_hi_i32 s1, s2, 0x50 655; GFX10-NEXT: v_mov_b32_e32 v0, s0 656; GFX10-NEXT: v_mov_b32_e32 v1, s1 657; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 658; GFX10-NEXT: s_endpgm 659; 660; GFX11-LABEL: mul64_sext_c: 661; GFX11: ; %bb.0: ; %entry 662; GFX11-NEXT: s_clause 0x1 663; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c 664; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 665; GFX11-NEXT: s_waitcnt lgkmcnt(0) 666; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 667; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 668; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 669; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 670; GFX11-NEXT: s_mov_b32 s3, 0x31016000 671; GFX11-NEXT: s_mov_b32 s2, -1 672; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 673; GFX11-NEXT: s_nop 0 674; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 675; GFX11-NEXT: s_endpgm 676; 677; GFX12-LABEL: mul64_sext_c: 678; GFX12: ; %bb.0: ; %entry 679; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 680; GFX12-NEXT: s_wait_kmcnt 0x0 681; GFX12-NEXT: s_ashr_i32 s3, s2, 31 682; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 683; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 684; GFX12-NEXT: s_mov_b32 
s3, 0x31016000 685; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 686; GFX12-NEXT: s_mov_b32 s2, -1 687; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 688; GFX12-NEXT: s_nop 0 689; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 690; GFX12-NEXT: s_endpgm 691; 692; EG-LABEL: mul64_sext_c: 693; EG: ; %bb.0: ; %entry 694; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 695; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 696; EG-NEXT: CF_END 697; EG-NEXT: PAD 698; EG-NEXT: ALU clause starting at 4: 699; EG-NEXT: MULHI_INT * T0.Y, KC0[2].Z, literal.x, 700; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 701; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 702; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, 703; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 704entry: 705 %0 = sext i32 %in to i64 706 %1 = mul i64 %0, 80 707 store i64 %1, ptr addrspace(1) %out 708 ret void 709} 710 711define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { 712; SI-LABEL: mul64_zext_c: 713; SI: ; %bb.0: ; %entry 714; SI-NEXT: s_load_dword s4, s[0:1], 0xb 715; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 716; SI-NEXT: v_mov_b32_e32 v0, 0x50 717; SI-NEXT: s_mov_b32 s3, 0xf000 718; SI-NEXT: s_mov_b32 s2, -1 719; SI-NEXT: s_waitcnt lgkmcnt(0) 720; SI-NEXT: v_mul_hi_u32 v1, s4, v0 721; SI-NEXT: s_mulk_i32 s4, 0x50 722; SI-NEXT: v_mov_b32_e32 v0, s4 723; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 724; SI-NEXT: s_endpgm 725; 726; VI-LABEL: mul64_zext_c: 727; VI: ; %bb.0: ; %entry 728; VI-NEXT: s_load_dword s2, s[0:1], 0x2c 729; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 730; VI-NEXT: v_mov_b32_e32 v0, 0x50 731; VI-NEXT: s_waitcnt lgkmcnt(0) 732; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 733; VI-NEXT: s_mov_b32 s3, 0xf000 734; VI-NEXT: s_mov_b32 s2, -1 735; VI-NEXT: s_nop 2 736; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 737; VI-NEXT: s_endpgm 738; 739; GFX9-LABEL: mul64_zext_c: 740; GFX9: ; %bb.0: ; %entry 741; GFX9-NEXT: s_load_dword s2, 
s[0:1], 0x2c 742; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 743; GFX9-NEXT: s_mov_b32 s7, 0xf000 744; GFX9-NEXT: s_mov_b32 s6, -1 745; GFX9-NEXT: s_waitcnt lgkmcnt(0) 746; GFX9-NEXT: s_mul_hi_u32 s0, s2, 0x50 747; GFX9-NEXT: s_mulk_i32 s2, 0x50 748; GFX9-NEXT: v_mov_b32_e32 v0, s2 749; GFX9-NEXT: v_mov_b32_e32 v1, s0 750; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 751; GFX9-NEXT: s_endpgm 752; 753; GFX10-LABEL: mul64_zext_c: 754; GFX10: ; %bb.0: ; %entry 755; GFX10-NEXT: s_clause 0x1 756; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c 757; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 758; GFX10-NEXT: s_mov_b32 s7, 0x31016000 759; GFX10-NEXT: s_mov_b32 s6, -1 760; GFX10-NEXT: s_waitcnt lgkmcnt(0) 761; GFX10-NEXT: s_mul_i32 s0, s2, 0x50 762; GFX10-NEXT: s_mul_hi_u32 s1, s2, 0x50 763; GFX10-NEXT: v_mov_b32_e32 v0, s0 764; GFX10-NEXT: v_mov_b32_e32 v1, s1 765; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 766; GFX10-NEXT: s_endpgm 767; 768; GFX11-LABEL: mul64_zext_c: 769; GFX11: ; %bb.0: ; %entry 770; GFX11-NEXT: s_clause 0x1 771; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c 772; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 773; GFX11-NEXT: s_waitcnt lgkmcnt(0) 774; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 775; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 776; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 777; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 778; GFX11-NEXT: s_mov_b32 s3, 0x31016000 779; GFX11-NEXT: s_mov_b32 s2, -1 780; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 781; GFX11-NEXT: s_nop 0 782; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 783; GFX11-NEXT: s_endpgm 784; 785; GFX12-LABEL: mul64_zext_c: 786; GFX12: ; %bb.0: ; %entry 787; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 788; GFX12-NEXT: s_mov_b32 s3, 0 789; GFX12-NEXT: s_wait_kmcnt 0x0 790; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 791; GFX12-NEXT: s_mov_b32 s3, 0x31016000 792; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 793; GFX12-NEXT: s_mov_b32 s2, 
-1 794; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 795; GFX12-NEXT: s_nop 0 796; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 797; GFX12-NEXT: s_endpgm 798; 799; EG-LABEL: mul64_zext_c: 800; EG: ; %bb.0: ; %entry 801; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 802; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 803; EG-NEXT: CF_END 804; EG-NEXT: PAD 805; EG-NEXT: ALU clause starting at 4: 806; EG-NEXT: MULHI * T0.Y, KC0[2].Z, literal.x, 807; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 808; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 809; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, 810; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 811entry: 812 %0 = zext i32 %in to i64 813 %1 = mul i64 %0, 80 814 store i64 %1, ptr addrspace(1) %out 815 ret void 816} 817 818define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { 819; SI-LABEL: v_mul64_sext_c: 820; SI: ; %bb.0: ; %entry 821; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 822; SI-NEXT: s_mov_b32 s7, 0xf000 823; SI-NEXT: s_mov_b32 s6, -1 824; SI-NEXT: s_mov_b32 s10, s6 825; SI-NEXT: s_mov_b32 s11, s7 826; SI-NEXT: s_waitcnt lgkmcnt(0) 827; SI-NEXT: s_mov_b32 s8, s2 828; SI-NEXT: s_mov_b32 s9, s3 829; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 830; SI-NEXT: s_movk_i32 s2, 0x50 831; SI-NEXT: s_mov_b32 s4, s0 832; SI-NEXT: s_mov_b32 s5, s1 833; SI-NEXT: s_waitcnt vmcnt(0) 834; SI-NEXT: v_mul_hi_i32 v1, v0, s2 835; SI-NEXT: v_mul_lo_u32 v0, v0, s2 836; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 837; SI-NEXT: s_endpgm 838; 839; VI-LABEL: v_mul64_sext_c: 840; VI: ; %bb.0: ; %entry 841; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 842; VI-NEXT: s_mov_b32 s7, 0xf000 843; VI-NEXT: s_mov_b32 s6, -1 844; VI-NEXT: s_mov_b32 s10, s6 845; VI-NEXT: s_mov_b32 s11, s7 846; VI-NEXT: s_waitcnt lgkmcnt(0) 847; VI-NEXT: s_mov_b32 s8, s2 848; VI-NEXT: s_mov_b32 s9, s3 849; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 850; VI-NEXT: s_movk_i32 s2, 0x50 851; VI-NEXT: s_mov_b32 s4, 
s0 852; VI-NEXT: s_mov_b32 s5, s1 853; VI-NEXT: s_waitcnt vmcnt(0) 854; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0 855; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 856; VI-NEXT: s_endpgm 857; 858; GFX9-LABEL: v_mul64_sext_c: 859; GFX9: ; %bb.0: ; %entry 860; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 861; GFX9-NEXT: s_mov_b32 s7, 0xf000 862; GFX9-NEXT: s_mov_b32 s6, -1 863; GFX9-NEXT: s_mov_b32 s10, s6 864; GFX9-NEXT: s_mov_b32 s11, s7 865; GFX9-NEXT: s_waitcnt lgkmcnt(0) 866; GFX9-NEXT: s_mov_b32 s8, s2 867; GFX9-NEXT: s_mov_b32 s9, s3 868; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 869; GFX9-NEXT: s_movk_i32 s2, 0x50 870; GFX9-NEXT: s_mov_b32 s4, s0 871; GFX9-NEXT: s_mov_b32 s5, s1 872; GFX9-NEXT: s_waitcnt vmcnt(0) 873; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 874; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 875; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 876; GFX9-NEXT: s_endpgm 877; 878; GFX10-LABEL: v_mul64_sext_c: 879; GFX10: ; %bb.0: ; %entry 880; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 881; GFX10-NEXT: s_mov_b32 s6, -1 882; GFX10-NEXT: s_mov_b32 s7, 0x31016000 883; GFX10-NEXT: s_mov_b32 s10, s6 884; GFX10-NEXT: s_mov_b32 s11, s7 885; GFX10-NEXT: s_waitcnt lgkmcnt(0) 886; GFX10-NEXT: s_mov_b32 s8, s2 887; GFX10-NEXT: s_mov_b32 s9, s3 888; GFX10-NEXT: s_mov_b32 s4, s0 889; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 890; GFX10-NEXT: s_mov_b32 s5, s1 891; GFX10-NEXT: s_waitcnt vmcnt(0) 892; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0 893; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 894; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 895; GFX10-NEXT: s_endpgm 896; 897; GFX11-LABEL: v_mul64_sext_c: 898; GFX11: ; %bb.0: ; %entry 899; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 900; GFX11-NEXT: s_mov_b32 s6, -1 901; GFX11-NEXT: s_mov_b32 s7, 0x31016000 902; GFX11-NEXT: s_mov_b32 s10, s6 903; GFX11-NEXT: s_mov_b32 s11, s7 904; GFX11-NEXT: s_waitcnt lgkmcnt(0) 905; GFX11-NEXT: s_mov_b32 s8, s2 906; GFX11-NEXT: s_mov_b32 s9, s3 907; 
GFX11-NEXT: s_mov_b32 s4, s0 908; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 909; GFX11-NEXT: s_mov_b32 s5, s1 910; GFX11-NEXT: s_waitcnt vmcnt(0) 911; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0 912; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 913; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 914; GFX11-NEXT: s_nop 0 915; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 916; GFX11-NEXT: s_endpgm 917; 918; GFX12-LABEL: v_mul64_sext_c: 919; GFX12: ; %bb.0: ; %entry 920; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 921; GFX12-NEXT: s_mov_b32 s6, -1 922; GFX12-NEXT: s_mov_b32 s7, 0x31016000 923; GFX12-NEXT: s_mov_b32 s10, s6 924; GFX12-NEXT: s_mov_b32 s11, s7 925; GFX12-NEXT: s_wait_kmcnt 0x0 926; GFX12-NEXT: s_mov_b32 s8, s2 927; GFX12-NEXT: s_mov_b32 s9, s3 928; GFX12-NEXT: s_mov_b32 s4, s0 929; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null 930; GFX12-NEXT: s_mov_b32 s5, s1 931; GFX12-NEXT: s_wait_loadcnt 0x0 932; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0 933; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 934; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 935; GFX12-NEXT: s_nop 0 936; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 937; GFX12-NEXT: s_endpgm 938; 939; EG-LABEL: v_mul64_sext_c: 940; EG: ; %bb.0: ; %entry 941; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 942; EG-NEXT: TEX 0 @6 943; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 944; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 945; EG-NEXT: CF_END 946; EG-NEXT: PAD 947; EG-NEXT: Fetch clause starting at 6: 948; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 949; EG-NEXT: ALU clause starting at 8: 950; EG-NEXT: MOV * T0.X, KC0[2].Z, 951; EG-NEXT: ALU clause starting at 9: 952; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 953; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 954; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 955; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 956; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 957entry: 958 %val = load i32, ptr addrspace(1) %in, align 4 959 %ext = sext i32 %val to i64 960 %mul = 
mul i64 %ext, 80 961 store i64 %mul, ptr addrspace(1) %out, align 8 962 ret void 963} 964 965define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { 966; SI-LABEL: v_mul64_zext_c: 967; SI: ; %bb.0: ; %entry 968; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 969; SI-NEXT: s_mov_b32 s7, 0xf000 970; SI-NEXT: s_mov_b32 s6, -1 971; SI-NEXT: s_mov_b32 s10, s6 972; SI-NEXT: s_mov_b32 s11, s7 973; SI-NEXT: s_waitcnt lgkmcnt(0) 974; SI-NEXT: s_mov_b32 s8, s2 975; SI-NEXT: s_mov_b32 s9, s3 976; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 977; SI-NEXT: s_movk_i32 s2, 0x50 978; SI-NEXT: s_mov_b32 s4, s0 979; SI-NEXT: s_mov_b32 s5, s1 980; SI-NEXT: s_waitcnt vmcnt(0) 981; SI-NEXT: v_mul_hi_u32 v1, v0, s2 982; SI-NEXT: v_mul_lo_u32 v0, v0, s2 983; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 984; SI-NEXT: s_endpgm 985; 986; VI-LABEL: v_mul64_zext_c: 987; VI: ; %bb.0: ; %entry 988; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 989; VI-NEXT: s_mov_b32 s7, 0xf000 990; VI-NEXT: s_mov_b32 s6, -1 991; VI-NEXT: s_mov_b32 s10, s6 992; VI-NEXT: s_mov_b32 s11, s7 993; VI-NEXT: s_waitcnt lgkmcnt(0) 994; VI-NEXT: s_mov_b32 s8, s2 995; VI-NEXT: s_mov_b32 s9, s3 996; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 997; VI-NEXT: s_movk_i32 s2, 0x50 998; VI-NEXT: s_mov_b32 s4, s0 999; VI-NEXT: s_mov_b32 s5, s1 1000; VI-NEXT: s_waitcnt vmcnt(0) 1001; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0 1002; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1003; VI-NEXT: s_endpgm 1004; 1005; GFX9-LABEL: v_mul64_zext_c: 1006; GFX9: ; %bb.0: ; %entry 1007; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1008; GFX9-NEXT: s_mov_b32 s7, 0xf000 1009; GFX9-NEXT: s_mov_b32 s6, -1 1010; GFX9-NEXT: s_mov_b32 s10, s6 1011; GFX9-NEXT: s_mov_b32 s11, s7 1012; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1013; GFX9-NEXT: s_mov_b32 s8, s2 1014; GFX9-NEXT: s_mov_b32 s9, s3 1015; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 1016; GFX9-NEXT: s_movk_i32 s2, 0x50 1017; GFX9-NEXT: 
s_mov_b32 s4, s0 1018; GFX9-NEXT: s_mov_b32 s5, s1 1019; GFX9-NEXT: s_waitcnt vmcnt(0) 1020; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2 1021; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 1022; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1023; GFX9-NEXT: s_endpgm 1024; 1025; GFX10-LABEL: v_mul64_zext_c: 1026; GFX10: ; %bb.0: ; %entry 1027; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1028; GFX10-NEXT: s_mov_b32 s6, -1 1029; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1030; GFX10-NEXT: s_mov_b32 s10, s6 1031; GFX10-NEXT: s_mov_b32 s11, s7 1032; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1033; GFX10-NEXT: s_mov_b32 s8, s2 1034; GFX10-NEXT: s_mov_b32 s9, s3 1035; GFX10-NEXT: s_mov_b32 s4, s0 1036; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 1037; GFX10-NEXT: s_mov_b32 s5, s1 1038; GFX10-NEXT: s_waitcnt vmcnt(0) 1039; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0 1040; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 1041; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1042; GFX10-NEXT: s_endpgm 1043; 1044; GFX11-LABEL: v_mul64_zext_c: 1045; GFX11: ; %bb.0: ; %entry 1046; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1047; GFX11-NEXT: s_mov_b32 s6, -1 1048; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1049; GFX11-NEXT: s_mov_b32 s10, s6 1050; GFX11-NEXT: s_mov_b32 s11, s7 1051; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1052; GFX11-NEXT: s_mov_b32 s8, s2 1053; GFX11-NEXT: s_mov_b32 s9, s3 1054; GFX11-NEXT: s_mov_b32 s4, s0 1055; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 1056; GFX11-NEXT: s_mov_b32 s5, s1 1057; GFX11-NEXT: s_waitcnt vmcnt(0) 1058; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0 1059; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 1060; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 1061; GFX11-NEXT: s_nop 0 1062; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1063; GFX11-NEXT: s_endpgm 1064; 1065; GFX12-LABEL: v_mul64_zext_c: 1066; GFX12: ; %bb.0: ; %entry 1067; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1068; GFX12-NEXT: s_mov_b32 s6, -1 1069; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1070; GFX12-NEXT: 
s_mov_b32 s10, s6 1071; GFX12-NEXT: s_mov_b32 s11, s7 1072; GFX12-NEXT: s_wait_kmcnt 0x0 1073; GFX12-NEXT: s_mov_b32 s8, s2 1074; GFX12-NEXT: s_mov_b32 s9, s3 1075; GFX12-NEXT: s_mov_b32 s4, s0 1076; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null 1077; GFX12-NEXT: s_mov_b32 s5, s1 1078; GFX12-NEXT: s_wait_loadcnt 0x0 1079; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0 1080; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 1081; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 1082; GFX12-NEXT: s_nop 0 1083; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1084; GFX12-NEXT: s_endpgm 1085; 1086; EG-LABEL: v_mul64_zext_c: 1087; EG: ; %bb.0: ; %entry 1088; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1089; EG-NEXT: TEX 0 @6 1090; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1091; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1092; EG-NEXT: CF_END 1093; EG-NEXT: PAD 1094; EG-NEXT: Fetch clause starting at 6: 1095; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1096; EG-NEXT: ALU clause starting at 8: 1097; EG-NEXT: MOV * T0.X, KC0[2].Z, 1098; EG-NEXT: ALU clause starting at 9: 1099; EG-NEXT: MULHI * T0.Y, T0.X, literal.x, 1100; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 1101; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1102; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 1103; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 1104entry: 1105 %val = load i32, ptr addrspace(1) %in, align 4 1106 %ext = zext i32 %val to i64 1107 %mul = mul i64 %ext, 80 1108 store i64 %mul, ptr addrspace(1) %out, align 8 1109 ret void 1110} 1111 1112define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1113; SI-LABEL: v_mul64_sext_inline_imm: 1114; SI: ; %bb.0: ; %entry 1115; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1116; SI-NEXT: s_mov_b32 s7, 0xf000 1117; SI-NEXT: s_mov_b32 s6, -1 1118; SI-NEXT: s_mov_b32 s10, s6 1119; SI-NEXT: s_mov_b32 s11, s7 1120; SI-NEXT: s_waitcnt lgkmcnt(0) 1121; SI-NEXT: s_mov_b32 s8, s2 1122; SI-NEXT: s_mov_b32 s9, s3 1123; SI-NEXT: 
buffer_load_dword v0, off, s[8:11], 0 1124; SI-NEXT: s_mov_b32 s4, s0 1125; SI-NEXT: s_mov_b32 s5, s1 1126; SI-NEXT: s_waitcnt vmcnt(0) 1127; SI-NEXT: v_mul_hi_i32 v1, v0, 9 1128; SI-NEXT: v_mul_lo_u32 v0, v0, 9 1129; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1130; SI-NEXT: s_endpgm 1131; 1132; VI-LABEL: v_mul64_sext_inline_imm: 1133; VI: ; %bb.0: ; %entry 1134; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1135; VI-NEXT: s_mov_b32 s7, 0xf000 1136; VI-NEXT: s_mov_b32 s6, -1 1137; VI-NEXT: s_mov_b32 s10, s6 1138; VI-NEXT: s_mov_b32 s11, s7 1139; VI-NEXT: s_waitcnt lgkmcnt(0) 1140; VI-NEXT: s_mov_b32 s8, s2 1141; VI-NEXT: s_mov_b32 s9, s3 1142; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1143; VI-NEXT: s_mov_b32 s4, s0 1144; VI-NEXT: s_mov_b32 s5, s1 1145; VI-NEXT: s_waitcnt vmcnt(0) 1146; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0 1147; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1148; VI-NEXT: s_endpgm 1149; 1150; GFX9-LABEL: v_mul64_sext_inline_imm: 1151; GFX9: ; %bb.0: ; %entry 1152; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1153; GFX9-NEXT: s_mov_b32 s7, 0xf000 1154; GFX9-NEXT: s_mov_b32 s6, -1 1155; GFX9-NEXT: s_mov_b32 s10, s6 1156; GFX9-NEXT: s_mov_b32 s11, s7 1157; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1158; GFX9-NEXT: s_mov_b32 s8, s2 1159; GFX9-NEXT: s_mov_b32 s9, s3 1160; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 1161; GFX9-NEXT: s_mov_b32 s4, s0 1162; GFX9-NEXT: s_mov_b32 s5, s1 1163; GFX9-NEXT: s_waitcnt vmcnt(0) 1164; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9 1165; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9 1166; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1167; GFX9-NEXT: s_endpgm 1168; 1169; GFX10-LABEL: v_mul64_sext_inline_imm: 1170; GFX10: ; %bb.0: ; %entry 1171; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1172; GFX10-NEXT: s_mov_b32 s6, -1 1173; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1174; GFX10-NEXT: s_mov_b32 s10, s6 1175; GFX10-NEXT: s_mov_b32 s11, s7 1176; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1177; GFX10-NEXT: 
s_mov_b32 s8, s2 1178; GFX10-NEXT: s_mov_b32 s9, s3 1179; GFX10-NEXT: s_mov_b32 s4, s0 1180; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 1181; GFX10-NEXT: s_mov_b32 s5, s1 1182; GFX10-NEXT: s_waitcnt vmcnt(0) 1183; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9 1184; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9 1185; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1186; GFX10-NEXT: s_endpgm 1187; 1188; GFX11-LABEL: v_mul64_sext_inline_imm: 1189; GFX11: ; %bb.0: ; %entry 1190; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1191; GFX11-NEXT: s_mov_b32 s6, -1 1192; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1193; GFX11-NEXT: s_mov_b32 s10, s6 1194; GFX11-NEXT: s_mov_b32 s11, s7 1195; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1196; GFX11-NEXT: s_mov_b32 s8, s2 1197; GFX11-NEXT: s_mov_b32 s9, s3 1198; GFX11-NEXT: s_mov_b32 s4, s0 1199; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 1200; GFX11-NEXT: s_mov_b32 s5, s1 1201; GFX11-NEXT: s_waitcnt vmcnt(0) 1202; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9 1203; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9 1204; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 1205; GFX11-NEXT: s_nop 0 1206; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1207; GFX11-NEXT: s_endpgm 1208; 1209; GFX12-LABEL: v_mul64_sext_inline_imm: 1210; GFX12: ; %bb.0: ; %entry 1211; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1212; GFX12-NEXT: s_mov_b32 s6, -1 1213; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1214; GFX12-NEXT: s_mov_b32 s10, s6 1215; GFX12-NEXT: s_mov_b32 s11, s7 1216; GFX12-NEXT: s_wait_kmcnt 0x0 1217; GFX12-NEXT: s_mov_b32 s8, s2 1218; GFX12-NEXT: s_mov_b32 s9, s3 1219; GFX12-NEXT: s_mov_b32 s4, s0 1220; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null 1221; GFX12-NEXT: s_mov_b32 s5, s1 1222; GFX12-NEXT: s_wait_loadcnt 0x0 1223; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0 1224; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0 1225; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 1226; GFX12-NEXT: s_nop 0 1227; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1228; GFX12-NEXT: s_endpgm 1229; 1230; 
EG-LABEL: v_mul64_sext_inline_imm: 1231; EG: ; %bb.0: ; %entry 1232; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1233; EG-NEXT: TEX 0 @6 1234; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1235; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1236; EG-NEXT: CF_END 1237; EG-NEXT: PAD 1238; EG-NEXT: Fetch clause starting at 6: 1239; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1240; EG-NEXT: ALU clause starting at 8: 1241; EG-NEXT: MOV * T0.X, KC0[2].Z, 1242; EG-NEXT: ALU clause starting at 9: 1243; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 1244; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1245; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1246; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 1247; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) 1248entry: 1249 %val = load i32, ptr addrspace(1) %in, align 4 1250 %ext = sext i32 %val to i64 1251 %mul = mul i64 %ext, 9 1252 store i64 %mul, ptr addrspace(1) %out, align 8 1253 ret void 1254} 1255 1256define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { 1257; SI-LABEL: s_mul_i32: 1258; SI: ; %bb.0: ; %entry 1259; SI-NEXT: s_load_dword s4, s[0:1], 0x13 1260; SI-NEXT: s_load_dword s5, s[0:1], 0x1c 1261; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1262; SI-NEXT: s_mov_b32 s3, 0xf000 1263; SI-NEXT: s_mov_b32 s2, -1 1264; SI-NEXT: s_waitcnt lgkmcnt(0) 1265; SI-NEXT: s_mul_i32 s4, s4, s5 1266; SI-NEXT: v_mov_b32_e32 v0, s4 1267; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1268; SI-NEXT: s_endpgm 1269; 1270; VI-LABEL: s_mul_i32: 1271; VI: ; %bb.0: ; %entry 1272; VI-NEXT: s_load_dword s4, s[0:1], 0x4c 1273; VI-NEXT: s_load_dword s5, s[0:1], 0x70 1274; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1275; VI-NEXT: s_mov_b32 s3, 0xf000 1276; VI-NEXT: s_mov_b32 s2, -1 1277; VI-NEXT: s_waitcnt lgkmcnt(0) 1278; VI-NEXT: s_mul_i32 s4, s4, s5 1279; VI-NEXT: v_mov_b32_e32 v0, s4 1280; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1281; VI-NEXT: s_endpgm 1282; 1283; GFX9-LABEL: s_mul_i32: 1284; GFX9: ; 
%bb.0: ; %entry 1285; GFX9-NEXT: s_load_dword s2, s[0:1], 0x4c 1286; GFX9-NEXT: s_load_dword s3, s[0:1], 0x70 1287; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1288; GFX9-NEXT: s_mov_b32 s7, 0xf000 1289; GFX9-NEXT: s_mov_b32 s6, -1 1290; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX9-NEXT: s_mul_i32 s0, s2, s3 1292; GFX9-NEXT: v_mov_b32_e32 v0, s0 1293; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1294; GFX9-NEXT: s_endpgm 1295; 1296; GFX10-LABEL: s_mul_i32: 1297; GFX10: ; %bb.0: ; %entry 1298; GFX10-NEXT: s_clause 0x2 1299; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c 1300; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 1301; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1302; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1303; GFX10-NEXT: s_mov_b32 s6, -1 1304; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1305; GFX10-NEXT: s_mul_i32 s0, s2, s3 1306; GFX10-NEXT: v_mov_b32_e32 v0, s0 1307; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 1308; GFX10-NEXT: s_endpgm 1309; 1310; GFX11-LABEL: s_mul_i32: 1311; GFX11: ; %bb.0: ; %entry 1312; GFX11-NEXT: s_clause 0x2 1313; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c 1314; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 1315; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1316; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1317; GFX11-NEXT: s_mul_i32 s2, s2, s3 1318; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1319; GFX11-NEXT: v_mov_b32_e32 v0, s2 1320; GFX11-NEXT: s_mov_b32 s2, -1 1321; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1322; GFX11-NEXT: s_nop 0 1323; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1324; GFX11-NEXT: s_endpgm 1325; 1326; GFX12-LABEL: s_mul_i32: 1327; GFX12: ; %bb.0: ; %entry 1328; GFX12-NEXT: s_clause 0x2 1329; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c 1330; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 1331; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1332; GFX12-NEXT: s_wait_kmcnt 0x0 1333; GFX12-NEXT: s_mul_i32 s2, s2, s3 1334; GFX12-NEXT: s_mov_b32 s3, 0x31016000 1335; GFX12-NEXT: v_mov_b32_e32 v0, s2 1336; GFX12-NEXT: s_mov_b32 s2, -1 1337; 
GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null 1338; GFX12-NEXT: s_nop 0 1339; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1340; GFX12-NEXT: s_endpgm 1341; 1342; EG-LABEL: s_mul_i32: 1343; EG: ; %bb.0: ; %entry 1344; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1345; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1346; EG-NEXT: CF_END 1347; EG-NEXT: PAD 1348; EG-NEXT: ALU clause starting at 4: 1349; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1350; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1351; EG-NEXT: MULLO_INT * T1.X, KC0[4].Z, KC0[6].W, 1352entry: 1353 %mul = mul i32 %a, %b 1354 store i32 %mul, ptr addrspace(1) %out, align 4 1355 ret void 1356} 1357 1358define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1359; SI-LABEL: v_mul_i32: 1360; SI: ; %bb.0: ; %entry 1361; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1362; SI-NEXT: s_mov_b32 s7, 0xf000 1363; SI-NEXT: s_mov_b32 s6, -1 1364; SI-NEXT: s_mov_b32 s10, s6 1365; SI-NEXT: s_mov_b32 s11, s7 1366; SI-NEXT: s_waitcnt lgkmcnt(0) 1367; SI-NEXT: s_mov_b32 s8, s2 1368; SI-NEXT: s_mov_b32 s9, s3 1369; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1370; SI-NEXT: s_mov_b32 s4, s0 1371; SI-NEXT: s_mov_b32 s5, s1 1372; SI-NEXT: s_waitcnt vmcnt(0) 1373; SI-NEXT: v_mul_lo_u32 v0, v0, v1 1374; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1375; SI-NEXT: s_endpgm 1376; 1377; VI-LABEL: v_mul_i32: 1378; VI: ; %bb.0: ; %entry 1379; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1380; VI-NEXT: s_mov_b32 s7, 0xf000 1381; VI-NEXT: s_mov_b32 s6, -1 1382; VI-NEXT: s_mov_b32 s10, s6 1383; VI-NEXT: s_mov_b32 s11, s7 1384; VI-NEXT: s_waitcnt lgkmcnt(0) 1385; VI-NEXT: s_mov_b32 s8, s2 1386; VI-NEXT: s_mov_b32 s9, s3 1387; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1388; VI-NEXT: s_mov_b32 s4, s0 1389; VI-NEXT: s_mov_b32 s5, s1 1390; VI-NEXT: s_waitcnt vmcnt(0) 1391; VI-NEXT: v_mul_lo_u32 v0, v0, v1 1392; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1393; VI-NEXT: s_endpgm 1394; 
1395; GFX9-LABEL: v_mul_i32: 1396; GFX9: ; %bb.0: ; %entry 1397; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1398; GFX9-NEXT: s_mov_b32 s7, 0xf000 1399; GFX9-NEXT: s_mov_b32 s6, -1 1400; GFX9-NEXT: s_mov_b32 s10, s6 1401; GFX9-NEXT: s_mov_b32 s11, s7 1402; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1403; GFX9-NEXT: s_mov_b32 s8, s2 1404; GFX9-NEXT: s_mov_b32 s9, s3 1405; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1406; GFX9-NEXT: s_mov_b32 s4, s0 1407; GFX9-NEXT: s_mov_b32 s5, s1 1408; GFX9-NEXT: s_waitcnt vmcnt(0) 1409; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 1410; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1411; GFX9-NEXT: s_endpgm 1412; 1413; GFX10-LABEL: v_mul_i32: 1414; GFX10: ; %bb.0: ; %entry 1415; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1416; GFX10-NEXT: s_mov_b32 s6, -1 1417; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1418; GFX10-NEXT: s_mov_b32 s10, s6 1419; GFX10-NEXT: s_mov_b32 s11, s7 1420; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1421; GFX10-NEXT: s_mov_b32 s8, s2 1422; GFX10-NEXT: s_mov_b32 s9, s3 1423; GFX10-NEXT: s_mov_b32 s4, s0 1424; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1425; GFX10-NEXT: s_mov_b32 s5, s1 1426; GFX10-NEXT: s_waitcnt vmcnt(0) 1427; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 1428; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 1429; GFX10-NEXT: s_endpgm 1430; 1431; GFX11-LABEL: v_mul_i32: 1432; GFX11: ; %bb.0: ; %entry 1433; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1434; GFX11-NEXT: s_mov_b32 s6, -1 1435; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1436; GFX11-NEXT: s_mov_b32 s10, s6 1437; GFX11-NEXT: s_mov_b32 s11, s7 1438; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1439; GFX11-NEXT: s_mov_b32 s8, s2 1440; GFX11-NEXT: s_mov_b32 s9, s3 1441; GFX11-NEXT: s_mov_b32 s4, s0 1442; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 1443; GFX11-NEXT: s_mov_b32 s5, s1 1444; GFX11-NEXT: s_waitcnt vmcnt(0) 1445; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 1446; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 1447; GFX11-NEXT: s_nop 0 1448; 
GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1449; GFX11-NEXT: s_endpgm 1450; 1451; GFX12-LABEL: v_mul_i32: 1452; GFX12: ; %bb.0: ; %entry 1453; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1454; GFX12-NEXT: s_mov_b32 s6, -1 1455; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1456; GFX12-NEXT: s_mov_b32 s10, s6 1457; GFX12-NEXT: s_mov_b32 s11, s7 1458; GFX12-NEXT: s_wait_kmcnt 0x0 1459; GFX12-NEXT: s_mov_b32 s8, s2 1460; GFX12-NEXT: s_mov_b32 s9, s3 1461; GFX12-NEXT: s_mov_b32 s4, s0 1462; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null 1463; GFX12-NEXT: s_mov_b32 s5, s1 1464; GFX12-NEXT: s_wait_loadcnt 0x0 1465; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 1466; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null 1467; GFX12-NEXT: s_nop 0 1468; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1469; GFX12-NEXT: s_endpgm 1470; 1471; EG-LABEL: v_mul_i32: 1472; EG: ; %bb.0: ; %entry 1473; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1474; EG-NEXT: TEX 0 @6 1475; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1476; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1477; EG-NEXT: CF_END 1478; EG-NEXT: PAD 1479; EG-NEXT: Fetch clause starting at 6: 1480; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1481; EG-NEXT: ALU clause starting at 8: 1482; EG-NEXT: MOV * T0.X, KC0[2].Z, 1483; EG-NEXT: ALU clause starting at 9: 1484; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1485; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y, 1486; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1487entry: 1488 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1489 %a = load i32, ptr addrspace(1) %in 1490 %b = load i32, ptr addrspace(1) %b_ptr 1491 %result = mul i32 %a, %b 1492 store i32 %result, ptr addrspace(1) %out 1493 ret void 1494} 1495 1496define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { 1497; SI-LABEL: s_mul_i1: 1498; SI: ; %bb.0: ; %entry 1499; SI-NEXT: s_load_dword s4, s[0:1], 0x13 1500; SI-NEXT: s_load_dword s5, s[0:1], 0x1c 1501; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 1502; SI-NEXT: s_mov_b32 s3, 0xf000 1503; SI-NEXT: s_mov_b32 s2, -1 1504; SI-NEXT: s_waitcnt lgkmcnt(0) 1505; SI-NEXT: s_mul_i32 s4, s4, s5 1506; SI-NEXT: s_and_b32 s4, s4, 1 1507; SI-NEXT: v_mov_b32_e32 v0, s4 1508; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1509; SI-NEXT: s_endpgm 1510; 1511; VI-LABEL: s_mul_i1: 1512; VI: ; %bb.0: ; %entry 1513; VI-NEXT: s_load_dword s4, s[0:1], 0x70 1514; VI-NEXT: s_load_dword s5, s[0:1], 0x4c 1515; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1516; VI-NEXT: s_mov_b32 s3, 0xf000 1517; VI-NEXT: s_mov_b32 s2, -1 1518; VI-NEXT: s_waitcnt lgkmcnt(0) 1519; VI-NEXT: v_mov_b32_e32 v0, s4 1520; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0 1521; VI-NEXT: v_and_b32_e32 v0, 1, v0 1522; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1523; VI-NEXT: s_endpgm 1524; 1525; GFX9-LABEL: s_mul_i1: 1526; GFX9: ; %bb.0: ; %entry 1527; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 1528; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c 1529; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1530; GFX9-NEXT: s_mov_b32 s7, 0xf000 1531; GFX9-NEXT: s_mov_b32 s6, -1 1532; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1533; GFX9-NEXT: v_mov_b32_e32 v0, s2 1534; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 1535; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1536; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 1537; GFX9-NEXT: s_endpgm 1538; 1539; GFX10-LABEL: s_mul_i1: 1540; GFX10: ; %bb.0: ; %entry 1541; GFX10-NEXT: s_clause 0x2 1542; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c 1543; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 1544; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1545; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1546; GFX10-NEXT: s_mov_b32 s6, -1 1547; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1548; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 1549; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 1550; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 1551; GFX10-NEXT: s_endpgm 1552; 1553; GFX11-LABEL: s_mul_i1: 1554; GFX11: ; %bb.0: ; %entry 1555; GFX11-NEXT: s_clause 0x2 1556; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c 
1557; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 1558; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1559; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1560; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 1561; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1562; GFX11-NEXT: s_mov_b32 s2, -1 1563; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1564; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 1565; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 1566; GFX11-NEXT: s_nop 0 1567; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1568; GFX11-NEXT: s_endpgm 1569; 1570; GFX12-LABEL: s_mul_i1: 1571; GFX12: ; %bb.0: ; %entry 1572; GFX12-NEXT: s_clause 0x2 1573; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c 1574; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70 1575; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1576; GFX12-NEXT: s_wait_kmcnt 0x0 1577; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3 1578; GFX12-NEXT: s_mov_b32 s3, 0x31016000 1579; GFX12-NEXT: s_mov_b32 s2, -1 1580; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1581; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 1582; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null 1583; GFX12-NEXT: s_nop 0 1584; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1585; GFX12-NEXT: s_endpgm 1586; 1587; EG-LABEL: s_mul_i1: 1588; EG: ; %bb.0: ; %entry 1589; EG-NEXT: ALU 0, @10, KC0[], KC1[] 1590; EG-NEXT: TEX 1 @6 1591; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 1592; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1593; EG-NEXT: CF_END 1594; EG-NEXT: PAD 1595; EG-NEXT: Fetch clause starting at 6: 1596; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3 1597; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3 1598; EG-NEXT: ALU clause starting at 10: 1599; EG-NEXT: MOV * T0.X, 0.0, 1600; EG-NEXT: ALU clause starting at 11: 1601; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 1602; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, 1603; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1604; EG-NEXT: AND_INT T1.W, PS, 1, 1605; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1606; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1607; EG-NEXT: LSHL T0.X, PV.W, PS, 1608; EG-NEXT: LSHL * T0.W, 
literal.x, PS, 1609; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1610; EG-NEXT: MOV T0.Y, 0.0, 1611; EG-NEXT: MOV * T0.Z, 0.0, 1612; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1613; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1614entry: 1615 %mul = mul i1 %a, %b 1616 store i1 %mul, ptr addrspace(1) %out, align 4 1617 ret void 1618} 1619 1620define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1621; SI-LABEL: v_mul_i1: 1622; SI: ; %bb.0: ; %entry 1623; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1624; SI-NEXT: s_mov_b32 s7, 0xf000 1625; SI-NEXT: s_mov_b32 s6, -1 1626; SI-NEXT: s_mov_b32 s10, s6 1627; SI-NEXT: s_mov_b32 s11, s7 1628; SI-NEXT: s_waitcnt lgkmcnt(0) 1629; SI-NEXT: s_mov_b32 s8, s2 1630; SI-NEXT: s_mov_b32 s9, s3 1631; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1632; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1633; SI-NEXT: s_mov_b32 s4, s0 1634; SI-NEXT: s_mov_b32 s5, s1 1635; SI-NEXT: s_waitcnt vmcnt(0) 1636; SI-NEXT: v_mul_lo_u32 v0, v0, v1 1637; SI-NEXT: v_and_b32_e32 v0, 1, v0 1638; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1639; SI-NEXT: s_endpgm 1640; 1641; VI-LABEL: v_mul_i1: 1642; VI: ; %bb.0: ; %entry 1643; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1644; VI-NEXT: s_mov_b32 s7, 0xf000 1645; VI-NEXT: s_mov_b32 s6, -1 1646; VI-NEXT: s_mov_b32 s10, s6 1647; VI-NEXT: s_mov_b32 s11, s7 1648; VI-NEXT: s_waitcnt lgkmcnt(0) 1649; VI-NEXT: s_mov_b32 s8, s2 1650; VI-NEXT: s_mov_b32 s9, s3 1651; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1652; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1653; VI-NEXT: s_mov_b32 s4, s0 1654; VI-NEXT: s_mov_b32 s5, s1 1655; VI-NEXT: s_waitcnt vmcnt(0) 1656; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1 1657; VI-NEXT: v_and_b32_e32 v0, 1, v0 1658; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1659; VI-NEXT: s_endpgm 1660; 1661; GFX9-LABEL: v_mul_i1: 1662; GFX9: ; %bb.0: ; %entry 1663; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1664; GFX9-NEXT: s_mov_b32 s7, 0xf000 
1665; GFX9-NEXT: s_mov_b32 s6, -1 1666; GFX9-NEXT: s_mov_b32 s10, s6 1667; GFX9-NEXT: s_mov_b32 s11, s7 1668; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX9-NEXT: s_mov_b32 s8, s2 1670; GFX9-NEXT: s_mov_b32 s9, s3 1671; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1672; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1673; GFX9-NEXT: s_mov_b32 s4, s0 1674; GFX9-NEXT: s_mov_b32 s5, s1 1675; GFX9-NEXT: s_waitcnt vmcnt(0) 1676; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 1677; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1678; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 1679; GFX9-NEXT: s_endpgm 1680; 1681; GFX10-LABEL: v_mul_i1: 1682; GFX10: ; %bb.0: ; %entry 1683; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1684; GFX10-NEXT: s_mov_b32 s6, -1 1685; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1686; GFX10-NEXT: s_mov_b32 s10, s6 1687; GFX10-NEXT: s_mov_b32 s11, s7 1688; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1689; GFX10-NEXT: s_mov_b32 s8, s2 1690; GFX10-NEXT: s_mov_b32 s9, s3 1691; GFX10-NEXT: s_clause 0x1 1692; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1693; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1694; GFX10-NEXT: s_mov_b32 s4, s0 1695; GFX10-NEXT: s_mov_b32 s5, s1 1696; GFX10-NEXT: s_waitcnt vmcnt(0) 1697; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 1698; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 1699; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 1700; GFX10-NEXT: s_endpgm 1701; 1702; GFX11-LABEL: v_mul_i1: 1703; GFX11: ; %bb.0: ; %entry 1704; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1705; GFX11-NEXT: s_mov_b32 s6, -1 1706; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1707; GFX11-NEXT: s_mov_b32 s10, s6 1708; GFX11-NEXT: s_mov_b32 s11, s7 1709; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1710; GFX11-NEXT: s_mov_b32 s8, s2 1711; GFX11-NEXT: s_mov_b32 s9, s3 1712; GFX11-NEXT: s_clause 0x1 1713; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0 1714; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4 1715; GFX11-NEXT: s_mov_b32 s4, s0 1716; GFX11-NEXT: s_mov_b32 s5, s1 
1717; GFX11-NEXT: s_waitcnt vmcnt(0) 1718; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1 1719; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1720; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 1721; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0 1722; GFX11-NEXT: s_nop 0 1723; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1724; GFX11-NEXT: s_endpgm 1725; 1726; GFX12-LABEL: v_mul_i1: 1727; GFX12: ; %bb.0: ; %entry 1728; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1729; GFX12-NEXT: s_mov_b32 s6, -1 1730; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1731; GFX12-NEXT: s_mov_b32 s10, s6 1732; GFX12-NEXT: s_mov_b32 s11, s7 1733; GFX12-NEXT: s_wait_kmcnt 0x0 1734; GFX12-NEXT: s_mov_b32 s8, s2 1735; GFX12-NEXT: s_mov_b32 s9, s3 1736; GFX12-NEXT: s_clause 0x1 1737; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null 1738; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 1739; GFX12-NEXT: s_mov_b32 s4, s0 1740; GFX12-NEXT: s_mov_b32 s5, s1 1741; GFX12-NEXT: s_wait_loadcnt 0x0 1742; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 1743; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1744; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 1745; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null 1746; GFX12-NEXT: s_nop 0 1747; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1748; GFX12-NEXT: s_endpgm 1749; 1750; EG-LABEL: v_mul_i1: 1751; EG: ; %bb.0: ; %entry 1752; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1753; EG-NEXT: TEX 1 @6 1754; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 1755; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1756; EG-NEXT: CF_END 1757; EG-NEXT: PAD 1758; EG-NEXT: Fetch clause starting at 6: 1759; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1 1760; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1761; EG-NEXT: ALU clause starting at 10: 1762; EG-NEXT: MOV * T0.X, KC0[2].Z, 1763; EG-NEXT: ALU clause starting at 11: 1764; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 1765; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 1766; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1767; EG-NEXT: AND_INT T1.W, PS, 1, 1768; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 
1769; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1770; EG-NEXT: LSHL T0.X, PV.W, PS, 1771; EG-NEXT: LSHL * T0.W, literal.x, PS, 1772; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1773; EG-NEXT: MOV T0.Y, 0.0, 1774; EG-NEXT: MOV * T0.Z, 0.0, 1775; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1776; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1777entry: 1778 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1779 %a = load i1, ptr addrspace(1) %in 1780 %b = load i1, ptr addrspace(1) %b_ptr 1781 %result = mul i1 %a, %b 1782 store i1 %result, ptr addrspace(1) %out 1783 ret void 1784} 1785 1786; A standard 64-bit multiply. The expansion should be around 6 instructions. 1787; Hand-matching the expansion would need a really complicated list of FileCheck 1788; expressions that a correct optimization could 'break'; at minimum, the compiler 1789; must not crash with a 'failed to select' error. The checks below no longer use 1790; FUNC-LABEL: they are autogenerated by utils/update_llc_test_checks.py, so 1791; regenerate them with that script rather than editing them by hand. 
1792 1793define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { 1794; SI-LABEL: s_mul_i64: 1795; SI: ; %bb.0: ; %entry 1796; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1797; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1798; SI-NEXT: s_mov_b32 s3, 0xf000 1799; SI-NEXT: s_mov_b32 s2, -1 1800; SI-NEXT: s_waitcnt lgkmcnt(0) 1801; SI-NEXT: s_mov_b32 s0, s4 1802; SI-NEXT: v_mov_b32_e32 v0, s8 1803; SI-NEXT: v_mul_hi_u32 v0, s6, v0 1804; SI-NEXT: s_mul_i32 s4, s6, s9 1805; SI-NEXT: s_mov_b32 s1, s5 1806; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 1807; SI-NEXT: s_mul_i32 s4, s7, s8 1808; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0 1809; SI-NEXT: s_mul_i32 s4, s6, s8 1810; SI-NEXT: v_mov_b32_e32 v0, s4 1811; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1812; SI-NEXT: s_endpgm 1813; 1814; VI-LABEL: s_mul_i64: 1815; VI: ; %bb.0: ; %entry 1816; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1817; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1818; VI-NEXT: s_mov_b32 s3, 0xf000 1819; VI-NEXT: s_mov_b32 s2, -1 1820; VI-NEXT: s_waitcnt lgkmcnt(0) 1821; VI-NEXT: s_mov_b32 s0, s4 1822; VI-NEXT: v_mov_b32_e32 v0, s8 1823; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0 1824; VI-NEXT: s_mul_i32 s4, s6, s9 1825; VI-NEXT: s_mov_b32 s1, s5 1826; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1827; VI-NEXT: s_mul_i32 s4, s7, s8 1828; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1829; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1830; VI-NEXT: s_endpgm 1831; 1832; GFX9-LABEL: s_mul_i64: 1833; GFX9: ; %bb.0: ; %entry 1834; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1835; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1836; GFX9-NEXT: s_mov_b32 s3, 0xf000 1837; GFX9-NEXT: s_mov_b32 s2, -1 1838; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1839; GFX9-NEXT: s_mov_b32 s0, s4 1840; GFX9-NEXT: s_mov_b32 s1, s5 1841; GFX9-NEXT: s_mul_i32 s4, s6, s9 1842; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8 1843; GFX9-NEXT: s_add_i32 s4, s5, s4 1844; GFX9-NEXT: s_mul_i32 s5, s7, s8 1845; 
GFX9-NEXT: s_add_i32 s4, s4, s5 1846; GFX9-NEXT: s_mul_i32 s5, s6, s8 1847; GFX9-NEXT: v_mov_b32_e32 v0, s5 1848; GFX9-NEXT: v_mov_b32_e32 v1, s4 1849; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1850; GFX9-NEXT: s_endpgm 1851; 1852; GFX10-LABEL: s_mul_i64: 1853; GFX10: ; %bb.0: ; %entry 1854; GFX10-NEXT: s_clause 0x1 1855; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1856; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1857; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1858; GFX10-NEXT: s_mul_i32 s0, s6, s3 1859; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 1860; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1861; GFX10-NEXT: s_add_i32 s0, s1, s0 1862; GFX10-NEXT: s_mul_i32 s1, s7, s2 1863; GFX10-NEXT: s_mul_i32 s2, s6, s2 1864; GFX10-NEXT: s_add_i32 s0, s0, s1 1865; GFX10-NEXT: v_mov_b32_e32 v0, s2 1866; GFX10-NEXT: v_mov_b32_e32 v1, s0 1867; GFX10-NEXT: s_mov_b32 s2, -1 1868; GFX10-NEXT: s_mov_b32 s0, s4 1869; GFX10-NEXT: s_mov_b32 s1, s5 1870; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1871; GFX10-NEXT: s_endpgm 1872; 1873; GFX11-LABEL: s_mul_i64: 1874; GFX11: ; %bb.0: ; %entry 1875; GFX11-NEXT: s_clause 0x1 1876; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1877; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1878; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1879; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1880; GFX11-NEXT: s_mul_i32 s1, s6, s1 1881; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0 1882; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 1883; GFX11-NEXT: s_add_i32 s1, s2, s1 1884; GFX11-NEXT: s_mul_i32 s2, s7, s0 1885; GFX11-NEXT: s_mul_i32 s0, s6, s0 1886; GFX11-NEXT: s_add_i32 s1, s1, s2 1887; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1888; GFX11-NEXT: s_mov_b32 s2, -1 1889; GFX11-NEXT: s_mov_b32 s0, s4 1890; GFX11-NEXT: s_mov_b32 s1, s5 1891; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1892; GFX11-NEXT: s_nop 0 1893; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1894; GFX11-NEXT: s_endpgm 1895; 1896; 
GFX12-LABEL: s_mul_i64: 1897; GFX12: ; %bb.0: ; %entry 1898; GFX12-NEXT: s_clause 0x1 1899; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1900; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1901; GFX12-NEXT: s_wait_kmcnt 0x0 1902; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1] 1903; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1904; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1905; GFX12-NEXT: s_mov_b32 s6, -1 1906; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 1907; GFX12-NEXT: s_nop 0 1908; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1909; GFX12-NEXT: s_endpgm 1910; 1911; EG-LABEL: s_mul_i64: 1912; EG: ; %bb.0: ; %entry 1913; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1914; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1915; EG-NEXT: CF_END 1916; EG-NEXT: PAD 1917; EG-NEXT: ALU clause starting at 4: 1918; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y, 1919; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z, 1920; EG-NEXT: ADD_INT T0.W, T0.X, PS, 1921; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y, 1922; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 1923; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1924; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1925; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y, 1926entry: 1927 %mul = mul i64 %a, %b 1928 store i64 %mul, ptr addrspace(1) %out, align 8 1929 ret void 1930} 1931 1932define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { 1933; SI-LABEL: v_mul_i64: 1934; SI: ; %bb.0: ; %entry 1935; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1936; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1937; SI-NEXT: s_mov_b32 s3, 0xf000 1938; SI-NEXT: s_mov_b32 s2, -1 1939; SI-NEXT: s_mov_b32 s10, s2 1940; SI-NEXT: s_mov_b32 s11, s3 1941; SI-NEXT: s_waitcnt lgkmcnt(0) 1942; SI-NEXT: s_mov_b32 s12, s6 1943; SI-NEXT: s_mov_b32 s13, s7 1944; SI-NEXT: s_mov_b32 s14, s2 1945; SI-NEXT: s_mov_b32 s15, s3 1946; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1947; SI-NEXT: buffer_load_dwordx2 
v[2:3], off, s[12:15], 0 1948; SI-NEXT: s_mov_b32 s0, s4 1949; SI-NEXT: s_mov_b32 s1, s5 1950; SI-NEXT: s_waitcnt vmcnt(0) 1951; SI-NEXT: v_mul_lo_u32 v1, v2, v1 1952; SI-NEXT: v_mul_hi_u32 v4, v2, v0 1953; SI-NEXT: v_mul_lo_u32 v3, v3, v0 1954; SI-NEXT: v_mul_lo_u32 v0, v2, v0 1955; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1956; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1957; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1958; SI-NEXT: s_endpgm 1959; 1960; VI-LABEL: v_mul_i64: 1961; VI: ; %bb.0: ; %entry 1962; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1963; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1964; VI-NEXT: s_mov_b32 s3, 0xf000 1965; VI-NEXT: s_mov_b32 s2, -1 1966; VI-NEXT: s_mov_b32 s10, s2 1967; VI-NEXT: s_mov_b32 s11, s3 1968; VI-NEXT: s_waitcnt lgkmcnt(0) 1969; VI-NEXT: s_mov_b32 s12, s6 1970; VI-NEXT: s_mov_b32 s13, s7 1971; VI-NEXT: s_mov_b32 s14, s2 1972; VI-NEXT: s_mov_b32 s15, s3 1973; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1974; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1975; VI-NEXT: s_mov_b32 s0, s4 1976; VI-NEXT: s_mov_b32 s1, s5 1977; VI-NEXT: s_waitcnt vmcnt(0) 1978; VI-NEXT: v_mul_lo_u32 v4, v2, v1 1979; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 1980; VI-NEXT: v_mul_lo_u32 v0, v3, v0 1981; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 1982; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 1983; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 1984; VI-NEXT: s_endpgm 1985; 1986; GFX9-LABEL: v_mul_i64: 1987; GFX9: ; %bb.0: ; %entry 1988; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1989; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1990; GFX9-NEXT: s_mov_b32 s3, 0xf000 1991; GFX9-NEXT: s_mov_b32 s2, -1 1992; GFX9-NEXT: s_mov_b32 s10, s2 1993; GFX9-NEXT: s_mov_b32 s11, s3 1994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1995; GFX9-NEXT: s_mov_b32 s12, s6 1996; GFX9-NEXT: s_mov_b32 s13, s7 1997; GFX9-NEXT: s_mov_b32 s14, s2 1998; GFX9-NEXT: s_mov_b32 s15, s3 1999; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2000; 
GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 2001; GFX9-NEXT: s_mov_b32 s0, s4 2002; GFX9-NEXT: s_mov_b32 s1, s5 2003; GFX9-NEXT: s_waitcnt vmcnt(0) 2004; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 2005; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 2006; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 2007; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 2008; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 2009; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2010; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2011; GFX9-NEXT: s_endpgm 2012; 2013; GFX10-LABEL: v_mul_i64: 2014; GFX10: ; %bb.0: ; %entry 2015; GFX10-NEXT: s_clause 0x1 2016; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 2017; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 2018; GFX10-NEXT: s_mov_b32 s2, -1 2019; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2020; GFX10-NEXT: s_mov_b32 s10, s2 2021; GFX10-NEXT: s_mov_b32 s11, s3 2022; GFX10-NEXT: s_mov_b32 s14, s2 2023; GFX10-NEXT: s_mov_b32 s15, s3 2024; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2025; GFX10-NEXT: s_mov_b32 s12, s6 2026; GFX10-NEXT: s_mov_b32 s13, s7 2027; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 2028; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 2029; GFX10-NEXT: s_mov_b32 s0, s4 2030; GFX10-NEXT: s_mov_b32 s1, s5 2031; GFX10-NEXT: s_waitcnt vmcnt(0) 2032; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 2033; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 2034; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 2035; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 2036; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 2037; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 2038; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2039; GFX10-NEXT: s_endpgm 2040; 2041; GFX11-LABEL: v_mul_i64: 2042; GFX11: ; %bb.0: ; %entry 2043; GFX11-NEXT: s_clause 0x1 2044; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2045; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 2046; GFX11-NEXT: s_mov_b32 s10, -1 2047; GFX11-NEXT: s_mov_b32 s11, 0x31016000 2048; GFX11-NEXT: s_mov_b32 s2, s10 2049; GFX11-NEXT: s_mov_b32 s3, s11 2050; GFX11-NEXT: s_mov_b32 s14, 
s10 2051; GFX11-NEXT: s_mov_b32 s15, s11 2052; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX11-NEXT: s_mov_b32 s12, s6 2054; GFX11-NEXT: s_mov_b32 s13, s7 2055; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 2056; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 2057; GFX11-NEXT: s_mov_b32 s8, s4 2058; GFX11-NEXT: s_mov_b32 s9, s5 2059; GFX11-NEXT: s_waitcnt vmcnt(0) 2060; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 2061; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 2062; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0 2063; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0 2064; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2065; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 2066; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 2067; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2068; GFX11-NEXT: s_nop 0 2069; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2070; GFX11-NEXT: s_endpgm 2071; 2072; GFX12-LABEL: v_mul_i64: 2073; GFX12: ; %bb.0: ; %entry 2074; GFX12-NEXT: s_clause 0x1 2075; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 2076; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 2077; GFX12-NEXT: s_mov_b32 s10, -1 2078; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2079; GFX12-NEXT: s_mov_b32 s2, s10 2080; GFX12-NEXT: s_mov_b32 s3, s11 2081; GFX12-NEXT: s_mov_b32 s14, s10 2082; GFX12-NEXT: s_mov_b32 s15, s11 2083; GFX12-NEXT: s_wait_kmcnt 0x0 2084; GFX12-NEXT: s_mov_b32 s12, s6 2085; GFX12-NEXT: s_mov_b32 s13, s7 2086; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null 2087; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null 2088; GFX12-NEXT: s_mov_b32 s8, s4 2089; GFX12-NEXT: s_mov_b32 s9, s5 2090; GFX12-NEXT: s_wait_loadcnt 0x0 2091; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 2092; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 2093; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 2094; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 2095; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2096; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1 2097; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, 
v4 2098; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2099; GFX12-NEXT: s_nop 0 2100; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2101; GFX12-NEXT: s_endpgm 2102; 2103; EG-LABEL: v_mul_i64: 2104; EG: ; %bb.0: ; %entry 2105; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 2106; EG-NEXT: TEX 1 @6 2107; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[] 2108; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1 2109; EG-NEXT: CF_END 2110; EG-NEXT: PAD 2111; EG-NEXT: Fetch clause starting at 6: 2112; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 2113; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 2114; EG-NEXT: ALU clause starting at 10: 2115; EG-NEXT: MOV T0.X, KC0[2].Z, 2116; EG-NEXT: MOV * T1.X, KC0[2].W, 2117; EG-NEXT: ALU clause starting at 12: 2118; EG-NEXT: MULHI * T0.Z, T0.X, T1.X, 2119; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y, 2120; EG-NEXT: ADD_INT T0.W, T0.Z, PS, 2121; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X, 2122; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 2123; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2124; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 2125; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2126entry: 2127 %a = load i64, ptr addrspace(1) %aptr, align 8 2128 %b = load i64, ptr addrspace(1) %bptr, align 8 2129 %mul = mul i64 %a, %b 2130 store i64 %mul, ptr addrspace(1) %out, align 8 2131 ret void 2132} 2133 2134define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { 2135; SI-LABEL: mul32_in_branch: 2136; SI: ; %bb.0: ; %entry 2137; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 2138; SI-NEXT: s_waitcnt lgkmcnt(0) 2139; SI-NEXT: s_cmp_lg_u32 s2, 0 2140; SI-NEXT: s_cbranch_scc0 .LBB15_2 2141; SI-NEXT: ; %bb.1: ; %else 2142; SI-NEXT: s_mul_i32 s6, s2, s3 2143; SI-NEXT: s_mov_b64 s[4:5], 0 2144; SI-NEXT: s_branch .LBB15_3 2145; SI-NEXT: .LBB15_2: 2146; SI-NEXT: s_mov_b64 s[4:5], -1 2147; SI-NEXT: ; implicit-def: $sgpr6 2148; SI-NEXT: .LBB15_3: ; %Flow 2149; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 2150; SI-NEXT: s_andn2_b64 
vcc, exec, s[4:5] 2151; SI-NEXT: s_waitcnt lgkmcnt(0) 2152; SI-NEXT: s_mov_b64 vcc, vcc 2153; SI-NEXT: s_cbranch_vccnz .LBB15_5 2154; SI-NEXT: ; %bb.4: ; %if 2155; SI-NEXT: s_mov_b32 s7, 0xf000 2156; SI-NEXT: s_mov_b32 s6, -1 2157; SI-NEXT: s_mov_b32 s4, s2 2158; SI-NEXT: s_mov_b32 s5, s3 2159; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 2160; SI-NEXT: s_branch .LBB15_6 2161; SI-NEXT: .LBB15_5: 2162; SI-NEXT: v_mov_b32_e32 v0, s6 2163; SI-NEXT: .LBB15_6: ; %endif 2164; SI-NEXT: s_mov_b32 s3, 0xf000 2165; SI-NEXT: s_mov_b32 s2, -1 2166; SI-NEXT: s_waitcnt vmcnt(0) 2167; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2168; SI-NEXT: s_endpgm 2169; 2170; VI-LABEL: mul32_in_branch: 2171; VI: ; %bb.0: ; %entry 2172; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2173; VI-NEXT: s_waitcnt lgkmcnt(0) 2174; VI-NEXT: s_cmp_lg_u32 s2, 0 2175; VI-NEXT: s_cbranch_scc0 .LBB15_2 2176; VI-NEXT: ; %bb.1: ; %else 2177; VI-NEXT: s_mul_i32 s6, s2, s3 2178; VI-NEXT: s_mov_b64 s[4:5], 0 2179; VI-NEXT: s_branch .LBB15_3 2180; VI-NEXT: .LBB15_2: 2181; VI-NEXT: s_mov_b64 s[4:5], -1 2182; VI-NEXT: ; implicit-def: $sgpr6 2183; VI-NEXT: .LBB15_3: ; %Flow 2184; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2185; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 2186; VI-NEXT: s_cbranch_vccnz .LBB15_5 2187; VI-NEXT: ; %bb.4: ; %if 2188; VI-NEXT: s_mov_b32 s7, 0xf000 2189; VI-NEXT: s_mov_b32 s6, -1 2190; VI-NEXT: s_waitcnt lgkmcnt(0) 2191; VI-NEXT: s_mov_b32 s4, s2 2192; VI-NEXT: s_mov_b32 s5, s3 2193; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 2194; VI-NEXT: s_branch .LBB15_6 2195; VI-NEXT: .LBB15_5: 2196; VI-NEXT: v_mov_b32_e32 v0, s6 2197; VI-NEXT: .LBB15_6: ; %endif 2198; VI-NEXT: s_waitcnt lgkmcnt(0) 2199; VI-NEXT: s_mov_b32 s3, 0xf000 2200; VI-NEXT: s_mov_b32 s2, -1 2201; VI-NEXT: s_waitcnt vmcnt(0) 2202; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2203; VI-NEXT: s_endpgm 2204; 2205; GFX9-LABEL: mul32_in_branch: 2206; GFX9: ; %bb.0: ; %entry 2207; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 
2208; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2209; GFX9-NEXT: s_cmp_lg_u32 s2, 0 2210; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 2211; GFX9-NEXT: ; %bb.1: ; %else 2212; GFX9-NEXT: s_mul_i32 s6, s2, s3 2213; GFX9-NEXT: s_mov_b64 s[4:5], 0 2214; GFX9-NEXT: s_branch .LBB15_3 2215; GFX9-NEXT: .LBB15_2: 2216; GFX9-NEXT: s_mov_b64 s[4:5], -1 2217; GFX9-NEXT: ; implicit-def: $sgpr6 2218; GFX9-NEXT: .LBB15_3: ; %Flow 2219; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2220; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] 2221; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 2222; GFX9-NEXT: ; %bb.4: ; %if 2223; GFX9-NEXT: s_mov_b32 s7, 0xf000 2224; GFX9-NEXT: s_mov_b32 s6, -1 2225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2226; GFX9-NEXT: s_mov_b32 s4, s2 2227; GFX9-NEXT: s_mov_b32 s5, s3 2228; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 2229; GFX9-NEXT: s_branch .LBB15_6 2230; GFX9-NEXT: .LBB15_5: 2231; GFX9-NEXT: v_mov_b32_e32 v0, s6 2232; GFX9-NEXT: .LBB15_6: ; %endif 2233; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2234; GFX9-NEXT: s_mov_b32 s3, 0xf000 2235; GFX9-NEXT: s_mov_b32 s2, -1 2236; GFX9-NEXT: s_waitcnt vmcnt(0) 2237; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2238; GFX9-NEXT: s_endpgm 2239; 2240; GFX10-LABEL: mul32_in_branch: 2241; GFX10: ; %bb.0: ; %entry 2242; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 2243; GFX10-NEXT: s_mov_b32 s4, 0 2244; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2245; GFX10-NEXT: s_cmp_lg_u32 s2, 0 2246; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 2247; GFX10-NEXT: ; %bb.1: ; %else 2248; GFX10-NEXT: s_mul_i32 s5, s2, s3 2249; GFX10-NEXT: s_branch .LBB15_3 2250; GFX10-NEXT: .LBB15_2: 2251; GFX10-NEXT: s_mov_b32 s4, -1 2252; GFX10-NEXT: ; implicit-def: $sgpr5 2253; GFX10-NEXT: .LBB15_3: ; %Flow 2254; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 2255; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 2256; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 2257; GFX10-NEXT: ; %bb.4: ; %if 2258; GFX10-NEXT: s_mov_b32 s7, 0x31016000 2259; GFX10-NEXT: s_mov_b32 s6, -1 2260; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2261; 
GFX10-NEXT: s_mov_b32 s4, s2 2262; GFX10-NEXT: s_mov_b32 s5, s3 2263; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 2264; GFX10-NEXT: s_branch .LBB15_6 2265; GFX10-NEXT: .LBB15_5: 2266; GFX10-NEXT: v_mov_b32_e32 v0, s5 2267; GFX10-NEXT: .LBB15_6: ; %endif 2268; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2269; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2270; GFX10-NEXT: s_mov_b32 s2, -1 2271; GFX10-NEXT: s_waitcnt vmcnt(0) 2272; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 2273; GFX10-NEXT: s_endpgm 2274; 2275; GFX11-LABEL: mul32_in_branch: 2276; GFX11: ; %bb.0: ; %entry 2277; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 2278; GFX11-NEXT: s_mov_b32 s4, 0 2279; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2280; GFX11-NEXT: s_cmp_lg_u32 s2, 0 2281; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 2282; GFX11-NEXT: ; %bb.1: ; %else 2283; GFX11-NEXT: s_mul_i32 s5, s2, s3 2284; GFX11-NEXT: s_branch .LBB15_3 2285; GFX11-NEXT: .LBB15_2: 2286; GFX11-NEXT: s_mov_b32 s4, -1 2287; GFX11-NEXT: ; implicit-def: $sgpr5 2288; GFX11-NEXT: .LBB15_3: ; %Flow 2289; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2290; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 2291; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 2292; GFX11-NEXT: ; %bb.4: ; %if 2293; GFX11-NEXT: s_mov_b32 s7, 0x31016000 2294; GFX11-NEXT: s_mov_b32 s6, -1 2295; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2296; GFX11-NEXT: s_mov_b32 s4, s2 2297; GFX11-NEXT: s_mov_b32 s5, s3 2298; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2299; GFX11-NEXT: s_branch .LBB15_6 2300; GFX11-NEXT: .LBB15_5: 2301; GFX11-NEXT: v_mov_b32_e32 v0, s5 2302; GFX11-NEXT: .LBB15_6: ; %endif 2303; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2304; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2305; GFX11-NEXT: s_mov_b32 s2, -1 2306; GFX11-NEXT: s_waitcnt vmcnt(0) 2307; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2308; GFX11-NEXT: s_nop 0 2309; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2310; GFX11-NEXT: s_endpgm 2311; 2312; GFX12-LABEL: mul32_in_branch: 2313; GFX12: ; %bb.0: ; %entry 2314; GFX12-NEXT: s_load_b64 
s[2:3], s[0:1], 0x34 2315; GFX12-NEXT: s_mov_b32 s4, 0 2316; GFX12-NEXT: s_wait_kmcnt 0x0 2317; GFX12-NEXT: s_cmp_lg_u32 s2, 0 2318; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 2319; GFX12-NEXT: ; %bb.1: ; %else 2320; GFX12-NEXT: s_mul_i32 s5, s2, s3 2321; GFX12-NEXT: s_branch .LBB15_3 2322; GFX12-NEXT: .LBB15_2: 2323; GFX12-NEXT: s_mov_b32 s4, -1 2324; GFX12-NEXT: ; implicit-def: $sgpr5 2325; GFX12-NEXT: .LBB15_3: ; %Flow 2326; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 2327; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 2328; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 2329; GFX12-NEXT: ; %bb.4: ; %if 2330; GFX12-NEXT: s_mov_b32 s7, 0x31016000 2331; GFX12-NEXT: s_mov_b32 s6, -1 2332; GFX12-NEXT: s_wait_kmcnt 0x0 2333; GFX12-NEXT: s_mov_b32 s4, s2 2334; GFX12-NEXT: s_mov_b32 s5, s3 2335; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2336; GFX12-NEXT: s_branch .LBB15_6 2337; GFX12-NEXT: .LBB15_5: 2338; GFX12-NEXT: v_mov_b32_e32 v0, s5 2339; GFX12-NEXT: .LBB15_6: ; %endif 2340; GFX12-NEXT: s_wait_kmcnt 0x0 2341; GFX12-NEXT: s_mov_b32 s3, 0x31016000 2342; GFX12-NEXT: s_mov_b32 s2, -1 2343; GFX12-NEXT: s_wait_loadcnt 0x0 2344; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null 2345; GFX12-NEXT: s_nop 0 2346; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2347; GFX12-NEXT: s_endpgm 2348; 2349; EG-LABEL: mul32_in_branch: 2350; EG: ; %bb.0: ; %entry 2351; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] 2352; EG-NEXT: JUMP @3 POP:1 2353; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[] 2354; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[] 2355; EG-NEXT: JUMP @8 POP:1 2356; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] 2357; EG-NEXT: TEX 0 @12 2358; EG-NEXT: POP @8 POP:1 2359; EG-NEXT: ALU 1, @27, KC0[], KC1[] 2360; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2361; EG-NEXT: CF_END 2362; EG-NEXT: PAD 2363; EG-NEXT: Fetch clause starting at 12: 2364; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 2365; EG-NEXT: ALU clause starting at 14: 2366; EG-NEXT: MOV T0.W, 
literal.x, 2367; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0, 2368; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 2369; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 2370; EG-NEXT: ALU clause starting at 18: 2371; EG-NEXT: MOV T1.W, KC0[2].W, 2372; EG-NEXT: MOV * T2.W, KC0[3].X, 2373; EG-NEXT: MOV T0.W, literal.x, 2374; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 2375; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2376; EG-NEXT: ALU clause starting at 23: 2377; EG-NEXT: MOV T1.W, KC0[2].Y, 2378; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, 2379; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 2380; EG-NEXT: ALU clause starting at 26: 2381; EG-NEXT: MOV * T0.X, KC0[2].Z, 2382; EG-NEXT: ALU clause starting at 27: 2383; EG-NEXT: LSHR * T1.X, T1.W, literal.x, 2384; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2385entry: 2386 %0 = icmp eq i32 %a, 0 2387 br i1 %0, label %if, label %else 2388 2389if: 2390 %1 = load i32, ptr addrspace(1) %in 2391 br label %endif 2392 2393else: 2394 %2 = mul i32 %a, %b 2395 br label %endif 2396 2397endif: 2398 %3 = phi i32 [%1, %if], [%2, %else] 2399 store i32 %3, ptr addrspace(1) %out 2400 ret void 2401} 2402 2403define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { 2404; SI-LABEL: mul64_in_branch: 2405; SI: ; %bb.0: ; %entry 2406; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 2407; SI-NEXT: s_mov_b64 s[8:9], 0 2408; SI-NEXT: s_waitcnt lgkmcnt(0) 2409; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 2410; SI-NEXT: s_and_b64 vcc, exec, s[10:11] 2411; SI-NEXT: s_cbranch_vccz .LBB16_4 2412; SI-NEXT: ; %bb.1: ; %else 2413; SI-NEXT: v_mov_b32_e32 v0, s6 2414; SI-NEXT: v_mul_hi_u32 v0, s4, v0 2415; SI-NEXT: s_mul_i32 s7, s4, s7 2416; SI-NEXT: s_mul_i32 s5, s5, s6 2417; SI-NEXT: s_mul_i32 s4, s4, s6 2418; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 2419; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 2420; SI-NEXT: v_mov_b32_e32 v0, s4 2421; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 2422; SI-NEXT: 
s_cbranch_vccnz .LBB16_3 2423; SI-NEXT: .LBB16_2: ; %if 2424; SI-NEXT: s_mov_b32 s7, 0xf000 2425; SI-NEXT: s_mov_b32 s6, -1 2426; SI-NEXT: s_mov_b32 s4, s2 2427; SI-NEXT: s_mov_b32 s5, s3 2428; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2429; SI-NEXT: .LBB16_3: ; %endif 2430; SI-NEXT: s_mov_b32 s3, 0xf000 2431; SI-NEXT: s_mov_b32 s2, -1 2432; SI-NEXT: s_waitcnt vmcnt(0) 2433; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2434; SI-NEXT: s_endpgm 2435; SI-NEXT: .LBB16_4: 2436; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 2437; SI-NEXT: s_branch .LBB16_2 2438; 2439; VI-LABEL: mul64_in_branch: 2440; VI: ; %bb.0: ; %entry 2441; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 2442; VI-NEXT: s_mov_b64 s[8:9], 0 2443; VI-NEXT: s_waitcnt lgkmcnt(0) 2444; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 2445; VI-NEXT: s_cbranch_scc0 .LBB16_4 2446; VI-NEXT: ; %bb.1: ; %else 2447; VI-NEXT: v_mov_b32_e32 v0, s6 2448; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 2449; VI-NEXT: s_mul_i32 s4, s4, s7 2450; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 2451; VI-NEXT: s_mul_i32 s4, s5, s6 2452; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 2453; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 2454; VI-NEXT: s_cbranch_vccnz .LBB16_3 2455; VI-NEXT: .LBB16_2: ; %if 2456; VI-NEXT: s_mov_b32 s7, 0xf000 2457; VI-NEXT: s_mov_b32 s6, -1 2458; VI-NEXT: s_mov_b32 s4, s2 2459; VI-NEXT: s_mov_b32 s5, s3 2460; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2461; VI-NEXT: .LBB16_3: ; %endif 2462; VI-NEXT: s_mov_b32 s3, 0xf000 2463; VI-NEXT: s_mov_b32 s2, -1 2464; VI-NEXT: s_waitcnt vmcnt(0) 2465; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2466; VI-NEXT: s_endpgm 2467; VI-NEXT: .LBB16_4: 2468; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 2469; VI-NEXT: s_branch .LBB16_2 2470; 2471; GFX9-LABEL: mul64_in_branch: 2472; GFX9: ; %bb.0: ; %entry 2473; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 2474; GFX9-NEXT: s_mov_b64 s[8:9], 0 2475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2476; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 2477; 
GFX9-NEXT: s_cbranch_scc0 .LBB16_3 2478; GFX9-NEXT: ; %bb.1: ; %else 2479; GFX9-NEXT: s_mul_i32 s7, s4, s7 2480; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 2481; GFX9-NEXT: s_add_i32 s7, s10, s7 2482; GFX9-NEXT: s_mul_i32 s5, s5, s6 2483; GFX9-NEXT: s_add_i32 s5, s7, s5 2484; GFX9-NEXT: s_mul_i32 s4, s4, s6 2485; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] 2486; GFX9-NEXT: s_cbranch_vccnz .LBB16_4 2487; GFX9-NEXT: .LBB16_2: ; %if 2488; GFX9-NEXT: s_mov_b32 s7, 0xf000 2489; GFX9-NEXT: s_mov_b32 s6, -1 2490; GFX9-NEXT: s_mov_b32 s4, s2 2491; GFX9-NEXT: s_mov_b32 s5, s3 2492; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2493; GFX9-NEXT: s_branch .LBB16_5 2494; GFX9-NEXT: .LBB16_3: 2495; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 2496; GFX9-NEXT: s_branch .LBB16_2 2497; GFX9-NEXT: .LBB16_4: 2498; GFX9-NEXT: v_mov_b32_e32 v0, s4 2499; GFX9-NEXT: v_mov_b32_e32 v1, s5 2500; GFX9-NEXT: .LBB16_5: ; %endif 2501; GFX9-NEXT: s_mov_b32 s3, 0xf000 2502; GFX9-NEXT: s_mov_b32 s2, -1 2503; GFX9-NEXT: s_waitcnt vmcnt(0) 2504; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2505; GFX9-NEXT: s_endpgm 2506; 2507; GFX10-LABEL: mul64_in_branch: 2508; GFX10: ; %bb.0: ; %entry 2509; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 2510; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2511; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 2512; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 2513; GFX10-NEXT: ; %bb.1: ; %else 2514; GFX10-NEXT: s_mul_i32 s7, s4, s7 2515; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 2516; GFX10-NEXT: s_mul_i32 s5, s5, s6 2517; GFX10-NEXT: s_add_i32 s7, s8, s7 2518; GFX10-NEXT: s_mul_i32 s4, s4, s6 2519; GFX10-NEXT: s_add_i32 s5, s7, s5 2520; GFX10-NEXT: s_mov_b32 s6, 0 2521; GFX10-NEXT: s_cbranch_execnz .LBB16_4 2522; GFX10-NEXT: .LBB16_2: ; %if 2523; GFX10-NEXT: s_mov_b32 s7, 0x31016000 2524; GFX10-NEXT: s_mov_b32 s6, -1 2525; GFX10-NEXT: s_mov_b32 s4, s2 2526; GFX10-NEXT: s_mov_b32 s5, s3 2527; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2528; GFX10-NEXT: s_branch .LBB16_5 2529; GFX10-NEXT: 
.LBB16_3: 2530; GFX10-NEXT: s_mov_b32 s6, -1 2531; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 2532; GFX10-NEXT: s_branch .LBB16_2 2533; GFX10-NEXT: .LBB16_4: 2534; GFX10-NEXT: v_mov_b32_e32 v0, s4 2535; GFX10-NEXT: v_mov_b32_e32 v1, s5 2536; GFX10-NEXT: .LBB16_5: ; %endif 2537; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2538; GFX10-NEXT: s_mov_b32 s2, -1 2539; GFX10-NEXT: s_waitcnt vmcnt(0) 2540; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2541; GFX10-NEXT: s_endpgm 2542; 2543; GFX11-LABEL: mul64_in_branch: 2544; GFX11: ; %bb.0: ; %entry 2545; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 2546; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2547; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 2548; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 2549; GFX11-NEXT: ; %bb.1: ; %else 2550; GFX11-NEXT: s_mul_i32 s7, s4, s7 2551; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 2552; GFX11-NEXT: s_mul_i32 s5, s5, s6 2553; GFX11-NEXT: s_add_i32 s7, s8, s7 2554; GFX11-NEXT: s_mul_i32 s4, s4, s6 2555; GFX11-NEXT: s_add_i32 s5, s7, s5 2556; GFX11-NEXT: s_mov_b32 s6, 0 2557; GFX11-NEXT: s_cbranch_execnz .LBB16_4 2558; GFX11-NEXT: .LBB16_2: ; %if 2559; GFX11-NEXT: s_mov_b32 s7, 0x31016000 2560; GFX11-NEXT: s_mov_b32 s6, -1 2561; GFX11-NEXT: s_mov_b32 s4, s2 2562; GFX11-NEXT: s_mov_b32 s5, s3 2563; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 2564; GFX11-NEXT: s_branch .LBB16_5 2565; GFX11-NEXT: .LBB16_3: 2566; GFX11-NEXT: s_mov_b32 s6, -1 2567; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 2568; GFX11-NEXT: s_branch .LBB16_2 2569; GFX11-NEXT: .LBB16_4: 2570; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2571; GFX11-NEXT: .LBB16_5: ; %endif 2572; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2573; GFX11-NEXT: s_mov_b32 s2, -1 2574; GFX11-NEXT: s_waitcnt vmcnt(0) 2575; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2576; GFX11-NEXT: s_nop 0 2577; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2578; GFX11-NEXT: s_endpgm 2579; 2580; GFX12-LABEL: mul64_in_branch: 2581; GFX12: ; %bb.0: ; %entry 2582; GFX12-NEXT: 
s_load_b256 s[0:7], s[0:1], 0x24 2583; GFX12-NEXT: s_wait_kmcnt 0x0 2584; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 2585; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 2586; GFX12-NEXT: ; %bb.1: ; %else 2587; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] 2588; GFX12-NEXT: s_mov_b32 s6, 0 2589; GFX12-NEXT: s_cbranch_execnz .LBB16_4 2590; GFX12-NEXT: .LBB16_2: ; %if 2591; GFX12-NEXT: s_mov_b32 s7, 0x31016000 2592; GFX12-NEXT: s_mov_b32 s6, -1 2593; GFX12-NEXT: s_mov_b32 s4, s2 2594; GFX12-NEXT: s_mov_b32 s5, s3 2595; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null 2596; GFX12-NEXT: s_branch .LBB16_5 2597; GFX12-NEXT: .LBB16_3: 2598; GFX12-NEXT: s_mov_b32 s6, -1 2599; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 2600; GFX12-NEXT: s_branch .LBB16_2 2601; GFX12-NEXT: .LBB16_4: 2602; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2603; GFX12-NEXT: .LBB16_5: ; %endif 2604; GFX12-NEXT: s_mov_b32 s3, 0x31016000 2605; GFX12-NEXT: s_mov_b32 s2, -1 2606; GFX12-NEXT: s_wait_loadcnt 0x0 2607; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 2608; GFX12-NEXT: s_nop 0 2609; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2610; GFX12-NEXT: s_endpgm 2611; 2612; EG-LABEL: mul64_in_branch: 2613; EG: ; %bb.0: ; %entry 2614; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] 2615; EG-NEXT: JUMP @3 POP:1 2616; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[] 2617; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] 2618; EG-NEXT: JUMP @8 POP:1 2619; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[] 2620; EG-NEXT: TEX 0 @12 2621; EG-NEXT: POP @8 POP:1 2622; EG-NEXT: ALU 1, @35, KC0[], KC1[] 2623; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 2624; EG-NEXT: CF_END 2625; EG-NEXT: PAD 2626; EG-NEXT: Fetch clause starting at 12: 2627; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 2628; EG-NEXT: ALU clause starting at 14: 2629; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, 2630; EG-NEXT: MOV * T1.W, literal.x, 2631; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 2632; EG-NEXT: SETNE_INT * T0.W, 
PV.W, 0.0,
; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT: ALU clause starting at 19:
; EG-NEXT: MOV T0.W, KC0[2].W,
; EG-NEXT: MOV * T1.W, KC0[3].Z,
; EG-NEXT: MOV T2.W, KC0[3].Y,
; EG-NEXT: MULLO_INT * T0.X, PV.W, PS,
; EG-NEXT: MOV T1.W, KC0[3].X,
; EG-NEXT: MULHI * T0.Y, T0.W, PV.W,
; EG-NEXT: ADD_INT T3.W, PS, T0.X,
; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W,
; EG-NEXT: ADD_INT T0.Y, PV.W, PS,
; EG-NEXT: MOV T1.W, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W,
; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 31:
; EG-NEXT: MOV T0.W, KC0[2].Y,
; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0,
; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
; EG-NEXT: ALU clause starting at 34:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 35:
; EG-NEXT: LSHR * T1.X, T0.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = icmp eq i64 %a, 0
  br i1 %0, label %if, label %else

if:
  %1 = load i64, ptr addrspace(1) %in
  br label %endif

else:
  %2 = mul i64 %a, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}

; 128-bit multiply whose two operands (%a, %b) are passed directly as kernel
; arguments; the full i128 product is stored to %out.
; The unnamed [8 x i32] arguments sit between %out, %a and %b in the argument
; list -- presumably padding so each i128 is fetched by its own scalar load;
; TODO confirm against the kernarg offsets in the expectations below.
define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
; SI-LABEL: s_mul_i128:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mul_hi_u32 v0, s8, v0
; SI-NEXT: v_mov_b32_e32 v1, s4
; SI-NEXT: v_mul_hi_u32 v1, s10, v1
; SI-NEXT: s_mul_i32 s7, s8, s7
; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0
; SI-NEXT: s_mul_i32 s7, s10, s5
; SI-NEXT: s_mul_i32 s12, s9, s6
; SI-NEXT: s_mul_i32 s6, s8, s6
; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1
; SI-NEXT: s_mul_i32 s7, s11, s4
; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0
; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1
; SI-NEXT: s_mul_i32 s7, s10, s4
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2
; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc
; SI-NEXT: v_mov_b32_e32 v1, s8
; SI-NEXT: v_mul_hi_u32 v5, s4, v1
; SI-NEXT: v_mul_hi_u32 v1, s5, v1
; SI-NEXT: v_mov_b32_e32 v3, s9
; SI-NEXT: v_mul_hi_u32 v4, s4, v3
; SI-NEXT: s_mul_i32 s7, s5, s8
; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5
; SI-NEXT: s_mul_i32 s6, s4, s9
; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5
; SI-NEXT: v_mul_hi_u32 v3, s5, v3
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; SI-NEXT: s_mul_i32 s5, s5, s9
; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc
; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4
; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; SI-NEXT: s_mul_i32 s4, s4, s8
; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_mul_i128:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
; VI-NEXT: s_mul_i32 s7, s8, s7
; VI-NEXT: v_mov_b32_e32 v6, s8
; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
; VI-NEXT: s_mul_i32 s12, s9, s6
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
; VI-NEXT: v_mov_b32_e32 v4, v1
; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v7
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mov_b32_e32 v8, s9
; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
; VI-NEXT: s_mul_i32 s8, s11, s4
; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v2, v5
; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
; VI-NEXT: s_mul_i32 s8, s10, s5
; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; VI-NEXT: v_mov_b32_e32 v1, v4
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i128:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s0, s12, s11
; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10
; GFX9-NEXT: s_mul_i32 s2, s14, s9
; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_mul_i32 s1, s13, s10
; GFX9-NEXT: s_add_i32 s2, s3, s2
; GFX9-NEXT: s_mul_i32 s3, s15, s8
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: s_mul_i32 s1, s12, s10
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: s_mul_i32 s3, s14, s8
; GFX9-NEXT: s_add_u32 s3, s3, s1
; GFX9-NEXT: s_addc_u32 s2, s2, s0
; GFX9-NEXT: s_mul_i32 s14, s9, s12
; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12
; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_i32 s1, s8, s13
; GFX9-NEXT: s_addc_u32 s11, s11, 0
; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13
; GFX9-NEXT: s_add_u32 s1, s1, s14
; GFX9-NEXT: s_addc_u32 s10, s10, 0
; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, 0
; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13
; GFX9-NEXT: s_mul_i32 s9, s9, s13
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_addc_u32 s10, s14, s11
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_add_u32 s9, s9, s3
; GFX9-NEXT: s_addc_u32 s10, s10, s2
; GFX9-NEXT: s_mul_i32 s2, s8, s12
; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s9
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s13, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s3, s8, s7
; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6
; GFX10-NEXT: s_mul_i32 s14, s10, s5
; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4
; GFX10-NEXT: s_mul_i32 s12, s9, s6
; GFX10-NEXT: s_mul_i32 s11, s11, s4
; GFX10-NEXT: s_add_i32 s3, s7, s3
; GFX10-NEXT: s_add_i32 s7, s15, s14
; GFX10-NEXT: s_mul_i32 s6, s8, s6
; GFX10-NEXT: s_mul_i32 s10, s10, s4
; GFX10-NEXT: s_add_i32 s3, s3, s12
; GFX10-NEXT: s_add_i32 s7, s7, s11
; GFX10-NEXT: s_mul_i32 s19, s5, s8
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_add_u32 s6, s10, s6
; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8
; GFX10-NEXT: s_addc_u32 s7, s7, s3
; GFX10-NEXT: s_mul_i32 s17, s4, s9
; GFX10-NEXT: s_add_u32 s3, s19, s20
; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX10-NEXT: s_mul_i32 s5, s5, s9
; GFX10-NEXT: s_addc_u32 s9, s18, 0
; GFX10-NEXT: s_add_u32 s3, s17, s3
; GFX10-NEXT: s_addc_u32 s10, s16, 0
; GFX10-NEXT: s_mul_i32 s12, s4, s8
; GFX10-NEXT: s_add_u32 s4, s9, s10
; GFX10-NEXT: s_addc_u32 s8, 0, 0
; GFX10-NEXT: s_add_u32 s4, s5, s4
; GFX10-NEXT: s_addc_u32 s5, s21, s8
; GFX10-NEXT: s_add_u32 s4, s4, s6
; GFX10-NEXT: s_addc_u32 s5, s5, s7
; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_i128:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c
; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mov_b32 s13, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_i32 s3, s8, s7
; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6
; GFX11-NEXT: s_mul_i32 s14, s10, s5
; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4
; GFX11-NEXT: s_mul_i32 s12, s9, s6
; GFX11-NEXT: s_mul_i32 s11, s11, s4
; GFX11-NEXT: s_add_i32 s3, s7, s3
; GFX11-NEXT: s_add_i32 s7, s15, s14
; GFX11-NEXT: s_mul_i32 s6, s8, s6
; GFX11-NEXT: s_mul_i32 s10, s10, s4
; GFX11-NEXT: s_add_i32 s3, s3, s12
; GFX11-NEXT: s_add_i32 s7, s7, s11
; GFX11-NEXT: s_mul_i32 s19, s5, s8
; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX11-NEXT: s_add_u32 s6, s10, s6
; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s3
; GFX11-NEXT: s_mul_i32 s17, s4, s9
; GFX11-NEXT: s_add_u32 s3, s19, s20
; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX11-NEXT: s_mul_i32 s5, s5, s9
; GFX11-NEXT: s_addc_u32 s9, s18, 0
; GFX11-NEXT: s_add_u32 s3, s17, s3
; GFX11-NEXT: s_addc_u32 s10, s16, 0
; GFX11-NEXT: s_mul_i32 s12, s4, s8
; GFX11-NEXT: s_add_u32 s4, s9, s10
; GFX11-NEXT: s_addc_u32 s8, 0, 0
; GFX11-NEXT: s_add_u32 s4, s5, s4
; GFX11-NEXT: s_addc_u32 s5, s21, s8
; GFX11-NEXT: s_add_u32 s4, s4, s6
; GFX11-NEXT: s_addc_u32 s5, s5, s7
; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_i128:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c
; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s15, s3
; GFX12-NEXT: s_mov_b32 s13, s3
; GFX12-NEXT: s_mov_b32 s17, s3
; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_mov_b32 s24, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s2, s4
; GFX12-NEXT: s_mov_b32 s14, s8
; GFX12-NEXT: s_mov_b32 s12, s9
; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3]
; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3]
; GFX12-NEXT: s_mov_b32 s2, s23
; GFX12-NEXT: s_mov_b32 s16, s5
; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3]
; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
; GFX12-NEXT: s_mov_b32 s2, s11
; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17]
; GFX12-NEXT: s_mov_b32 s18, s7
; GFX12-NEXT: s_mov_b32 s23, s3
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
; GFX12-NEXT: s_mov_b32 s25, s6
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3]
; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25]
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: s_mov_b32 s3, 0x31016000
; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
;
; EG-LABEL: s_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
; EG-NEXT: MULHI * T0.Y, KC0[5].X, KC0[8].X,
; EG-NEXT: MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
; EG-NEXT: MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
; EG-NEXT: MULHI * T1.X, KC0[5].X, KC0[7].W,
; EG-NEXT: MULHI * T1.Y, KC0[4].W, KC0[8].X,
; EG-NEXT: MULHI * T1.Z, KC0[8].Y, KC0[4].W,
; EG-NEXT: MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
; EG-NEXT: MULHI * T2.X, KC0[7].W, KC0[5].Y,
; EG-NEXT: MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
; EG-NEXT: MULHI * T2.Z, KC0[4].W, KC0[7].W,
; EG-NEXT: ADD_INT T2.W, T2.Y, PS,
; EG-NEXT: MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
; EG-NEXT: ADDC_UINT T2.Z, T2.Y, T2.Z,
; EG-NEXT: ADDC_UINT T3.W, PS, PV.W,
; EG-NEXT: MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
; EG-NEXT: ADD_INT T2.X, T2.X, PS,
; EG-NEXT: ADD_INT T2.Y, T1.Z, T1.W,
; EG-NEXT: ADD_INT T1.Z, T1.Y, PV.W,
; EG-NEXT: ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
; EG-NEXT: MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
; EG-NEXT: ADD_INT T4.X, PV.W, PV.Z,
; EG-NEXT: ADDC_UINT T1.Y, PV.W, PV.Z,
; EG-NEXT: ADD_INT T1.Z, PV.Y, PS,
; EG-NEXT: ADD_INT T0.W, PV.X, T0.W,
; EG-NEXT: MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
; EG-NEXT: ADD_INT T2.Y, PV.Z, PV.W,
; EG-NEXT: ADDC_UINT T1.Z, T0.Z, PS,
; EG-NEXT: ADD_INT T0.W, T0.Y, PV.Y,
; EG-NEXT: ADDC_UINT * T1.W, T0.X, PV.X,
; EG-NEXT: ADD_INT T0.Y, T0.X, T4.X,
; EG-NEXT: ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
; EG-NEXT: ADD_INT T0.W, PV.W, PS,
; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z,
; EG-NEXT: ADD_INT T0.W, PV.W, PS,
; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z,
; EG-NEXT: ADD_INT * T0.W, PV.W, PS,
; EG-NEXT: ADD_INT * T0.Z, T0.Y, T0.Z,
; EG-NEXT: ADD_INT * T0.Y, T3.X, T2.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
entry:
  %mul = mul i128 %a, %b
  store i128 %mul, ptr addrspace(1) %out
  ret void
}

; 128-bit multiply whose operands are loaded from memory: each work-item reads
; one i128 from %aptr and one from %bptr at index workitem.id.x and multiplies
; them.
; NOTE(review): %gep.out is computed from %bptr, so the product is stored over
; the %b operand and the %out argument is never used. This looks unintentional,
; but switching it to %out changes the generated code, so every expectation
; below would have to be regenerated with utils/update_llc_test_checks.py.
define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
; SI-LABEL: v_mul_i128:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0
; SI-NEXT: v_mov_b32_e32 v9, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: s_mov_b64 s[0:1], s[2:3]
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_lo_u32 v3, v4, v3
; SI-NEXT: v_mul_hi_u32 v10, v4, v2
; SI-NEXT: v_mul_lo_u32 v12, v6, v1
; SI-NEXT: v_mul_hi_u32 v13, v6, v0
; SI-NEXT: v_mul_lo_u32 v17, v1, v4
; SI-NEXT: v_mul_hi_u32 v18, v0, v4
; SI-NEXT: v_mul_lo_u32 v11, v5, v2
; SI-NEXT: v_mul_lo_u32 v7, v7, v0
; SI-NEXT: v_mul_hi_u32 v16, v1, v4
; SI-NEXT: v_mul_lo_u32 v15, v0, v5
; SI-NEXT: v_mul_hi_u32 v14, v0, v5
; SI-NEXT: v_mul_hi_u32 v19, v1, v5
; SI-NEXT: v_mul_lo_u32 v5, v1, v5
; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12
; SI-NEXT: v_mul_lo_u32 v2, v4, v2
; SI-NEXT: v_mul_lo_u32 v6, v6, v0
; SI-NEXT: v_mul_lo_u32 v0, v0, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18
; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc
; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4
; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc
; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc
; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_mul_i128:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v10, v4, v3
; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0
; VI-NEXT: v_mul_lo_u32 v14, v5, v2
; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v10
; VI-NEXT: v_mov_b32_e32 v10, v3
; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v14
; VI-NEXT: v_mov_b32_e32 v10, v4
; VI-NEXT: v_mov_b32_e32 v4, v11
; VI-NEXT: v_mul_lo_u32 v7, v7, v0
; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13]
; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v13
; VI-NEXT: v_mov_b32_e32 v0, v4
; VI-NEXT: v_mul_lo_u32 v11, v6, v1
; VI-NEXT: v_add_u32_e32 v6, vcc, v10, v0
; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
; VI-NEXT: v_add_u32_e32 v5, vcc, v11, v13
; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12
; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc
; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3
; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10]
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
; GFX9-NEXT: v_mul_lo_u32 v16, v7, v0
; GFX9-NEXT: v_mov_b32_e32 v7, v12
; GFX9-NEXT: v_mov_b32_e32 v12, v10
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12]
; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v0, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, v10
; GFX9-NEXT: v_mul_lo_u32 v4, v6, v1
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v0
; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7]
; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0
; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
; GFX10-NEXT: v_mov_b32_e32 v14, v12
; GFX10-NEXT: v_mov_b32_e32 v12, v10
; GFX10-NEXT: v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12]
; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0
; GFX10-NEXT: v_mul_lo_u32 v12, v6, v1
; GFX10-NEXT: v_mov_b32_e32 v4, v10
; GFX10-NEXT: v_add3_u32 v3, v3, v11, v15
; GFX10-NEXT: v_add_co_u32 v10, s0, v14, v4
; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s0, 0, 0, s0
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11]
; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i128:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1]
; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0
; GFX11-NEXT: v_mul_lo_u32 v14, v5, v2
; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
; GFX11-NEXT: v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v4, v2, 0
; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1
; GFX11-NEXT: v_mov_b32_e32 v2, v10
; GFX11-NEXT: v_mul_lo_u32 v10, v7, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v12, v12, v3, v14
; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
; GFX11-NEXT: v_add3_u32 v0, v10, v14, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i128:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1]
; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2
; GFX12-NEXT: v_mul_lo_u32 v7, v7, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
; GFX12-NEXT: v_mov_b32_e32 v14, v12
; GFX12-NEXT: v_mov_b32_e32 v12, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
; GFX12-NEXT: v_mul_lo_u32 v11, v4, v3
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0
; GFX12-NEXT: v_mul_lo_u32 v12, v6, v1
; GFX12-NEXT: v_mov_b32_e32 v4, v10
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add3_u32 v3, v3, v11, v15
; GFX12-NEXT: v_add_co_u32 v10, s0, v14, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11]
; GFX12-NEXT: v_add3_u32 v3, v7, v3, v12
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
;
; EG-LABEL: v_mul_i128:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 41, @14, KC0[], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T2.XYZW, T1.X, 0, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MULLO_INT * T1.Y, T0.Y, T2.Y,
; EG-NEXT: MULHI * T1.Z, T0.Y, T2.Y,
; EG-NEXT: MULLO_INT * T1.W, T2.Z, T0.X,
; EG-NEXT: MULLO_INT * T3.X, T2.Y, T0.Z,
; EG-NEXT: MULHI * T3.Y, T0.Y, T2.X,
; EG-NEXT: MULHI * T3.Z, T0.X, T2.Y,
; EG-NEXT: MULHI * T3.W, T2.Z, T0.X,
; EG-NEXT: MULLO_INT * T2.Z, T2.Z, T0.Y,
; EG-NEXT: MULHI * T4.X, T2.X, T0.Z,
; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T2.X,
; EG-NEXT: MULHI * T4.Y, T0.X, T2.X,
; EG-NEXT: ADD_INT T4.W, T0.Y, PS,
; EG-NEXT: MULLO_INT * T2.Y, T0.X, T2.Y,
; EG-NEXT: ADDC_UINT T4.Z, T0.Y, T4.Y,
; EG-NEXT: ADDC_UINT T5.W, PS, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.W,
; EG-NEXT: ADD_INT T4.X, T4.X, PS,
; EG-NEXT: ADD_INT T0.Y, T3.W, T2.Z,
; EG-NEXT: ADD_INT T2.Z, T3.Z, PV.W,
; EG-NEXT: ADD_INT T0.W, T3.Y, PV.Z,
; EG-NEXT: MULLO_INT * T2.W, T2.W, T0.X,
; EG-NEXT: ADD_INT T5.X, PV.W, PV.Z,
; EG-NEXT: ADDC_UINT T3.Y, PV.W, PV.Z,
; EG-NEXT: ADD_INT T2.Z, PV.Y, PS,
; EG-NEXT: ADD_INT T0.W, PV.X, T3.X,
; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.Z,
; EG-NEXT: ADD_INT T4.Y, PV.Z, PV.W,
; EG-NEXT: ADDC_UINT T0.Z, T1.W, PS,
; EG-NEXT: ADD_INT T0.W, T1.Z, PV.Y,
; EG-NEXT: ADDC_UINT * T2.W, T1.Y, PV.X,
; EG-NEXT: ADD_INT T1.Y, T1.Y, T5.X,
; EG-NEXT: ADD_INT T1.Z, T1.W, T0.Y,
; EG-NEXT: ADD_INT T0.W, PV.W, PS,
; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z,
; EG-NEXT: ADD_INT T0.W, PV.W, PS,
; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z,
; EG-NEXT: ADD_INT * T0.W, PV.W, PS,
; EG-NEXT: ADD_INT * T0.Z, T1.Y, T1.Z,
; EG-NEXT: ADD_INT * T0.Y, T2.Y, T4.W,
; EG-NEXT: LSHR T1.X, T1.X, literal.x,
; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
  ; NOTE(review): the destination is derived from %bptr rather than %out -- confirm intended.
  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
  %a = load i128, ptr addrspace(1) %gep.a
  %b = load i128, ptr addrspace(1) %gep.b
  %mul = mul i128 %a, %b
  store i128 %mul, ptr addrspace(1) %gep.out
  ret void
}

; Multiply an i32 by 9 (2^3 + 1) in a function that does not use the
; amdgpu_kernel calling convention. In the expectations below the gfx9 and newer
; targets select a single v_lshl_add_u32 (shift left by 3, then add the
; original value), while the SI and VI targets keep v_mul_lo_u32 with the
; constant 9. The r600 (EG prefix) expectations contain only CF_END and PAD.
define i32 @mul_pow2_plus_1(i32 %val) {
; SI-LABEL: mul_pow2_plus_1:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mul_lo_u32 v0, v0, 9
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_pow2_plus_1:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, 9
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mul_pow2_plus_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: mul_pow2_plus_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mul_pow2_plus_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: mul_pow2_plus_1:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; EG-LABEL: mul_pow2_plus_1:
; EG: ; %bb.0:
; EG-NEXT: CF_END
; EG-NEXT: PAD
  %mul = mul i32 %val, 9
  ret i32 %mul
}

; Intrinsic called by v_mul_i128 to obtain the per-work-item index (%tid).
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Attribute groups: #0 is referenced by the s_mul_i128 and v_mul_i128 kernels,
; #1 by the workitem.id.x declaration.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone}