1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s 5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s 6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s 7; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s 8; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s 9 10; mul24 and mad24 are affected 11 12define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 13; SI-LABEL: test_mul_v2i32: 14; SI: ; %bb.0: ; %entry 15; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 16; SI-NEXT: s_mov_b32 s7, 0xf000 17; SI-NEXT: s_mov_b32 s6, -1 18; SI-NEXT: s_mov_b32 s10, s6 19; SI-NEXT: s_mov_b32 s11, s7 20; SI-NEXT: s_waitcnt lgkmcnt(0) 21; SI-NEXT: s_mov_b32 s8, s2 22; SI-NEXT: s_mov_b32 s9, s3 23; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 24; SI-NEXT: s_mov_b32 s4, s0 25; SI-NEXT: s_mov_b32 s5, s1 26; SI-NEXT: s_waitcnt vmcnt(0) 27; SI-NEXT: v_mul_lo_u32 v1, v1, v3 28; SI-NEXT: v_mul_lo_u32 v0, v0, v2 29; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 30; SI-NEXT: s_endpgm 31; 32; VI-LABEL: test_mul_v2i32: 33; VI: ; %bb.0: ; %entry 34; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 35; VI-NEXT: s_mov_b32 s7, 0xf000 36; VI-NEXT: s_mov_b32 s6, -1 37; VI-NEXT: s_mov_b32 s10, s6 38; VI-NEXT: s_mov_b32 s11, s7 39; VI-NEXT: s_waitcnt lgkmcnt(0) 40; VI-NEXT: s_mov_b32 s8, s2 41; VI-NEXT: s_mov_b32 s9, s3 42; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 43; VI-NEXT: s_mov_b32 s4, s0 44; VI-NEXT: s_mov_b32 s5, s1 45; VI-NEXT: s_waitcnt vmcnt(0) 46; VI-NEXT: v_mul_lo_u32 v1, v1, v3 47; VI-NEXT: v_mul_lo_u32 v0, v0, v2 48; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 49; VI-NEXT: s_endpgm 50; 51; GFX9-LABEL: test_mul_v2i32: 52; GFX9: ; %bb.0: ; %entry 53; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 54; GFX9-NEXT: s_mov_b32 s7, 0xf000 55; GFX9-NEXT: s_mov_b32 s6, -1 56; GFX9-NEXT: s_mov_b32 s10, s6 57; GFX9-NEXT: s_mov_b32 s11, s7 58; GFX9-NEXT: s_waitcnt lgkmcnt(0) 59; GFX9-NEXT: s_mov_b32 s8, s2 60; GFX9-NEXT: s_mov_b32 s9, s3 61; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 62; GFX9-NEXT: s_mov_b32 s4, s0 63; GFX9-NEXT: s_mov_b32 s5, s1 64; GFX9-NEXT: s_waitcnt vmcnt(0) 65; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 66; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2 67; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 68; GFX9-NEXT: s_endpgm 69; 70; GFX10-LABEL: test_mul_v2i32: 71; GFX10: ; %bb.0: ; %entry 72; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 73; GFX10-NEXT: s_mov_b32 s6, -1 74; GFX10-NEXT: s_mov_b32 s7, 0x31016000 75; GFX10-NEXT: s_mov_b32 s10, s6 76; GFX10-NEXT: s_mov_b32 s11, s7 77; GFX10-NEXT: s_waitcnt lgkmcnt(0) 78; GFX10-NEXT: s_mov_b32 s8, s2 79; GFX10-NEXT: s_mov_b32 s9, s3 80; GFX10-NEXT: s_mov_b32 s4, s0 81; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 82; GFX10-NEXT: s_mov_b32 s5, s1 83; GFX10-NEXT: s_waitcnt vmcnt(0) 84; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3 85; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2 86; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 87; GFX10-NEXT: s_endpgm 88; 89; GFX11-LABEL: test_mul_v2i32: 90; GFX11: ; %bb.0: ; %entry 91; GFX11-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 92; GFX11-NEXT: s_mov_b32 s6, -1 93; GFX11-NEXT: s_mov_b32 s7, 0x31016000 94; GFX11-NEXT: s_mov_b32 s10, s6 95; GFX11-NEXT: s_mov_b32 s11, s7 96; GFX11-NEXT: s_waitcnt lgkmcnt(0) 97; GFX11-NEXT: s_mov_b32 s8, s2 98; GFX11-NEXT: s_mov_b32 s9, s3 99; GFX11-NEXT: s_mov_b32 s4, s0 100; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 101; GFX11-NEXT: s_mov_b32 s5, s1 102; GFX11-NEXT: s_waitcnt vmcnt(0) 103; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3 104; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2 105; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 106; GFX11-NEXT: s_endpgm 107; 108; GFX12-LABEL: test_mul_v2i32: 109; GFX12: ; %bb.0: ; %entry 110; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 111; GFX12-NEXT: s_mov_b32 s6, -1 112; GFX12-NEXT: s_mov_b32 s7, 0x31016000 113; GFX12-NEXT: s_mov_b32 s10, s6 114; GFX12-NEXT: s_mov_b32 s11, s7 115; GFX12-NEXT: s_wait_kmcnt 0x0 116; GFX12-NEXT: s_mov_b32 s8, s2 117; GFX12-NEXT: s_mov_b32 s9, s3 118; GFX12-NEXT: s_mov_b32 s4, s0 119; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null 120; GFX12-NEXT: s_mov_b32 s5, s1 121; GFX12-NEXT: s_wait_loadcnt 0x0 122; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 123; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 124; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 125; GFX12-NEXT: s_endpgm 126; 127; EG-LABEL: test_mul_v2i32: 128; EG: ; %bb.0: ; %entry 129; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 130; EG-NEXT: TEX 0 @6 131; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 132; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 133; EG-NEXT: CF_END 134; EG-NEXT: PAD 135; EG-NEXT: Fetch clause starting at 6: 136; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 137; EG-NEXT: ALU clause starting at 8: 138; EG-NEXT: MOV * T0.X, KC0[2].Z, 139; EG-NEXT: ALU clause starting at 9: 140; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T0.W, 141; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 142; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Z, 143; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 144entry: 145 %b_ptr = 
getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 146 %a = load <2 x i32>, ptr addrspace(1) %in 147 %b = load <2 x i32>, ptr addrspace(1) %b_ptr 148 %result = mul <2 x i32> %a, %b 149 store <2 x i32> %result, ptr addrspace(1) %out 150 ret void 151} 152 153define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 154; SI-LABEL: v_mul_v4i32: 155; SI: ; %bb.0: ; %entry 156; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 157; SI-NEXT: s_mov_b32 s7, 0xf000 158; SI-NEXT: s_mov_b32 s6, -1 159; SI-NEXT: s_mov_b32 s10, s6 160; SI-NEXT: s_mov_b32 s11, s7 161; SI-NEXT: s_waitcnt lgkmcnt(0) 162; SI-NEXT: s_mov_b32 s8, s2 163; SI-NEXT: s_mov_b32 s9, s3 164; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 165; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 166; SI-NEXT: s_mov_b32 s4, s0 167; SI-NEXT: s_mov_b32 s5, s1 168; SI-NEXT: s_waitcnt vmcnt(0) 169; SI-NEXT: v_mul_lo_u32 v3, v3, v7 170; SI-NEXT: v_mul_lo_u32 v2, v2, v6 171; SI-NEXT: v_mul_lo_u32 v1, v1, v5 172; SI-NEXT: v_mul_lo_u32 v0, v0, v4 173; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 174; SI-NEXT: s_endpgm 175; 176; VI-LABEL: v_mul_v4i32: 177; VI: ; %bb.0: ; %entry 178; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 179; VI-NEXT: s_mov_b32 s7, 0xf000 180; VI-NEXT: s_mov_b32 s6, -1 181; VI-NEXT: s_mov_b32 s10, s6 182; VI-NEXT: s_mov_b32 s11, s7 183; VI-NEXT: s_waitcnt lgkmcnt(0) 184; VI-NEXT: s_mov_b32 s8, s2 185; VI-NEXT: s_mov_b32 s9, s3 186; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 187; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 188; VI-NEXT: s_mov_b32 s4, s0 189; VI-NEXT: s_mov_b32 s5, s1 190; VI-NEXT: s_waitcnt vmcnt(0) 191; VI-NEXT: v_mul_lo_u32 v3, v3, v7 192; VI-NEXT: v_mul_lo_u32 v2, v2, v6 193; VI-NEXT: v_mul_lo_u32 v1, v1, v5 194; VI-NEXT: v_mul_lo_u32 v0, v0, v4 195; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 196; VI-NEXT: s_endpgm 197; 198; GFX9-LABEL: v_mul_v4i32: 199; GFX9: ; %bb.0: ; %entry 200; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 201; GFX9-NEXT: s_mov_b32 s7, 0xf000 202; GFX9-NEXT: s_mov_b32 s6, -1 203; GFX9-NEXT: s_mov_b32 s10, s6 204; GFX9-NEXT: s_mov_b32 s11, s7 205; GFX9-NEXT: s_waitcnt lgkmcnt(0) 206; GFX9-NEXT: s_mov_b32 s8, s2 207; GFX9-NEXT: s_mov_b32 s9, s3 208; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 209; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 210; GFX9-NEXT: s_mov_b32 s4, s0 211; GFX9-NEXT: s_mov_b32 s5, s1 212; GFX9-NEXT: s_waitcnt vmcnt(0) 213; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7 214; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6 215; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5 216; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 217; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 218; GFX9-NEXT: s_endpgm 219; 220; GFX10-LABEL: v_mul_v4i32: 221; GFX10: ; %bb.0: ; %entry 222; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 223; GFX10-NEXT: s_mov_b32 s6, -1 224; GFX10-NEXT: s_mov_b32 s7, 0x31016000 225; GFX10-NEXT: s_mov_b32 s10, s6 226; GFX10-NEXT: s_mov_b32 s11, s7 227; GFX10-NEXT: s_waitcnt lgkmcnt(0) 228; GFX10-NEXT: s_mov_b32 s8, s2 229; GFX10-NEXT: s_mov_b32 s9, s3 230; GFX10-NEXT: s_clause 0x1 231; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 232; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 233; GFX10-NEXT: s_mov_b32 s4, s0 234; GFX10-NEXT: s_mov_b32 s5, s1 235; GFX10-NEXT: s_waitcnt vmcnt(0) 236; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7 237; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6 238; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5 239; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4 240; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 241; GFX10-NEXT: s_endpgm 242; 243; GFX11-LABEL: v_mul_v4i32: 244; GFX11: ; %bb.0: ; %entry 245; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 246; GFX11-NEXT: s_mov_b32 s6, -1 247; GFX11-NEXT: s_mov_b32 s7, 0x31016000 248; GFX11-NEXT: s_mov_b32 s10, s6 249; GFX11-NEXT: s_mov_b32 s11, s7 250; GFX11-NEXT: s_waitcnt lgkmcnt(0) 251; GFX11-NEXT: s_mov_b32 s8, s2 252; 
GFX11-NEXT: s_mov_b32 s9, s3 253; GFX11-NEXT: s_clause 0x1 254; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 255; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16 256; GFX11-NEXT: s_mov_b32 s4, s0 257; GFX11-NEXT: s_mov_b32 s5, s1 258; GFX11-NEXT: s_waitcnt vmcnt(0) 259; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7 260; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6 261; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5 262; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4 263; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 264; GFX11-NEXT: s_endpgm 265; 266; GFX12-LABEL: v_mul_v4i32: 267; GFX12: ; %bb.0: ; %entry 268; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 269; GFX12-NEXT: s_mov_b32 s6, -1 270; GFX12-NEXT: s_mov_b32 s7, 0x31016000 271; GFX12-NEXT: s_mov_b32 s10, s6 272; GFX12-NEXT: s_mov_b32 s11, s7 273; GFX12-NEXT: s_wait_kmcnt 0x0 274; GFX12-NEXT: s_mov_b32 s8, s2 275; GFX12-NEXT: s_mov_b32 s9, s3 276; GFX12-NEXT: s_clause 0x1 277; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null 278; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 279; GFX12-NEXT: s_mov_b32 s4, s0 280; GFX12-NEXT: s_mov_b32 s5, s1 281; GFX12-NEXT: s_wait_loadcnt 0x0 282; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7 283; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6 284; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5 285; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4 286; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null 287; GFX12-NEXT: s_endpgm 288; 289; EG-LABEL: v_mul_v4i32: 290; EG: ; %bb.0: ; %entry 291; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 292; EG-NEXT: TEX 1 @6 293; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 294; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1 295; EG-NEXT: CF_END 296; EG-NEXT: PAD 297; EG-NEXT: Fetch clause starting at 6: 298; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 299; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 300; EG-NEXT: ALU clause starting at 10: 301; EG-NEXT: MOV * T0.X, KC0[2].Z, 302; EG-NEXT: ALU clause starting at 11: 303; EG-NEXT: MULLO_INT * T0.W, T0.W, T1.W, 304; 
EG-NEXT: MULLO_INT * T0.Z, T0.Z, T1.Z, 305; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.Y, 306; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 307; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 308; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 309entry: 310 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 311 %a = load <4 x i32>, ptr addrspace(1) %in 312 %b = load <4 x i32>, ptr addrspace(1) %b_ptr 313 %result = mul <4 x i32> %a, %b 314 store <4 x i32> %result, ptr addrspace(1) %out 315 ret void 316} 317 318define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) { 319; SI-LABEL: s_trunc_i64_mul_to_i32: 320; SI: ; %bb.0: ; %entry 321; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 322; SI-NEXT: s_waitcnt lgkmcnt(0) 323; SI-NEXT: s_load_dword s3, s[4:5], 0xd 324; SI-NEXT: s_mov_b32 s7, 0xf000 325; SI-NEXT: s_mov_b32 s6, -1 326; SI-NEXT: s_mov_b32 s4, s0 327; SI-NEXT: s_waitcnt lgkmcnt(0) 328; SI-NEXT: s_mul_i32 s0, s3, s2 329; SI-NEXT: s_mov_b32 s5, s1 330; SI-NEXT: v_mov_b32_e32 v0, s0 331; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 332; SI-NEXT: s_endpgm 333; 334; VI-LABEL: s_trunc_i64_mul_to_i32: 335; VI: ; %bb.0: ; %entry 336; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 337; VI-NEXT: s_waitcnt lgkmcnt(0) 338; VI-NEXT: s_load_dword s3, s[4:5], 0x34 339; VI-NEXT: s_mov_b32 s7, 0xf000 340; VI-NEXT: s_mov_b32 s6, -1 341; VI-NEXT: s_mov_b32 s4, s0 342; VI-NEXT: s_waitcnt lgkmcnt(0) 343; VI-NEXT: s_mul_i32 s0, s3, s2 344; VI-NEXT: s_mov_b32 s5, s1 345; VI-NEXT: v_mov_b32_e32 v0, s0 346; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 347; VI-NEXT: s_endpgm 348; 349; GFX9-LABEL: s_trunc_i64_mul_to_i32: 350; GFX9: ; %bb.0: ; %entry 351; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 352; GFX9-NEXT: s_waitcnt lgkmcnt(0) 353; GFX9-NEXT: s_load_dword s3, s[4:5], 0x34 354; GFX9-NEXT: ; kill: killed $sgpr4_sgpr5 355; GFX9-NEXT: s_mov_b32 s7, 0xf000 356; GFX9-NEXT: s_mov_b32 s6, -1 357; GFX9-NEXT: s_mov_b32 s4, s0 358; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
359; GFX9-NEXT: s_mul_i32 s0, s3, s2 360; GFX9-NEXT: s_mov_b32 s5, s1 361; GFX9-NEXT: v_mov_b32_e32 v0, s0 362; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 363; GFX9-NEXT: s_endpgm 364; 365; GFX10-LABEL: s_trunc_i64_mul_to_i32: 366; GFX10: ; %bb.0: ; %entry 367; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 368; GFX10-NEXT: s_waitcnt lgkmcnt(0) 369; GFX10-NEXT: s_load_dword s3, s[4:5], 0x34 370; GFX10-NEXT: ; kill: killed $sgpr4_sgpr5 371; GFX10-NEXT: s_waitcnt lgkmcnt(0) 372; GFX10-NEXT: s_mul_i32 s2, s3, s2 373; GFX10-NEXT: s_mov_b32 s3, 0x31016000 374; GFX10-NEXT: v_mov_b32_e32 v0, s2 375; GFX10-NEXT: s_mov_b32 s2, -1 376; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 377; GFX10-NEXT: s_endpgm 378; 379; GFX11-LABEL: s_trunc_i64_mul_to_i32: 380; GFX11: ; %bb.0: ; %entry 381; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 382; GFX11-NEXT: s_waitcnt lgkmcnt(0) 383; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x34 384; GFX11-NEXT: s_waitcnt lgkmcnt(0) 385; GFX11-NEXT: s_mul_i32 s2, s3, s2 386; GFX11-NEXT: s_mov_b32 s3, 0x31016000 387; GFX11-NEXT: v_mov_b32_e32 v0, s2 388; GFX11-NEXT: s_mov_b32 s2, -1 389; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 390; GFX11-NEXT: s_endpgm 391; 392; GFX12-LABEL: s_trunc_i64_mul_to_i32: 393; GFX12: ; %bb.0: ; %entry 394; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 395; GFX12-NEXT: s_wait_kmcnt 0x0 396; GFX12-NEXT: s_load_b32 s3, s[4:5], 0x34 397; GFX12-NEXT: s_wait_kmcnt 0x0 398; GFX12-NEXT: s_mul_i32 s2, s3, s2 399; GFX12-NEXT: s_mov_b32 s3, 0x31016000 400; GFX12-NEXT: v_mov_b32_e32 v0, s2 401; GFX12-NEXT: s_mov_b32 s2, -1 402; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null 403; GFX12-NEXT: s_endpgm 404; 405; EG-LABEL: s_trunc_i64_mul_to_i32: 406; EG: ; %bb.0: ; %entry 407; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 408; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 409; EG-NEXT: CF_END 410; EG-NEXT: PAD 411; EG-NEXT: ALU clause starting at 4: 412; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 413; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 414; EG-NEXT: MULLO_INT * T1.X, KC0[3].Y, KC0[2].W, 415entry: 416 %mul = mul i64 %b, %a 417 %trunc = trunc i64 %mul to i32 418 store i32 %trunc, ptr addrspace(1) %out, align 8 419 ret void 420} 421 422define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { 423; SI-LABEL: v_trunc_i64_mul_to_i32: 424; SI: ; %bb.0: ; %entry 425; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 426; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 427; SI-NEXT: s_mov_b32 s7, 0xf000 428; SI-NEXT: s_mov_b32 s6, -1 429; SI-NEXT: s_mov_b32 s14, s6 430; SI-NEXT: s_waitcnt lgkmcnt(0) 431; SI-NEXT: s_mov_b32 s12, s2 432; SI-NEXT: s_mov_b32 s13, s3 433; SI-NEXT: s_mov_b32 s15, s7 434; SI-NEXT: s_mov_b32 s10, s6 435; SI-NEXT: s_mov_b32 s11, s7 436; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 437; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 438; SI-NEXT: s_mov_b32 s4, s0 439; SI-NEXT: s_mov_b32 s5, s1 440; SI-NEXT: s_waitcnt vmcnt(0) 441; SI-NEXT: v_mul_lo_u32 v0, v1, v0 442; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 443; SI-NEXT: s_endpgm 444; 445; VI-LABEL: v_trunc_i64_mul_to_i32: 446; VI: ; %bb.0: ; %entry 447; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 448; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 449; VI-NEXT: s_mov_b32 s7, 0xf000 450; VI-NEXT: s_mov_b32 s6, -1 451; VI-NEXT: s_mov_b32 s14, s6 452; VI-NEXT: s_waitcnt lgkmcnt(0) 453; VI-NEXT: s_mov_b32 s12, s2 454; VI-NEXT: s_mov_b32 s13, s3 455; VI-NEXT: s_mov_b32 s15, s7 456; VI-NEXT: s_mov_b32 s10, s6 457; VI-NEXT: s_mov_b32 s11, s7 458; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 459; VI-NEXT: buffer_load_dword v1, off, s[8:11], 0 460; VI-NEXT: s_mov_b32 s4, s0 461; VI-NEXT: s_mov_b32 s5, s1 462; VI-NEXT: s_waitcnt vmcnt(0) 463; VI-NEXT: v_mul_lo_u32 v0, v1, v0 464; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 465; VI-NEXT: s_endpgm 466; 467; GFX9-LABEL: v_trunc_i64_mul_to_i32: 468; GFX9: ; %bb.0: ; %entry 469; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 470; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 471; GFX9-NEXT: s_mov_b32 s7, 0xf000 472; GFX9-NEXT: s_mov_b32 s6, -1 473; GFX9-NEXT: s_mov_b32 s14, s6 474; GFX9-NEXT: s_waitcnt lgkmcnt(0) 475; GFX9-NEXT: s_mov_b32 s12, s2 476; GFX9-NEXT: s_mov_b32 s13, s3 477; GFX9-NEXT: s_mov_b32 s15, s7 478; GFX9-NEXT: s_mov_b32 s10, s6 479; GFX9-NEXT: s_mov_b32 s11, s7 480; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 481; GFX9-NEXT: buffer_load_dword v1, off, s[8:11], 0 482; GFX9-NEXT: s_mov_b32 s4, s0 483; GFX9-NEXT: s_mov_b32 s5, s1 484; GFX9-NEXT: s_waitcnt vmcnt(0) 485; GFX9-NEXT: v_mul_lo_u32 v0, v1, v0 486; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 487; GFX9-NEXT: s_endpgm 488; 489; GFX10-LABEL: v_trunc_i64_mul_to_i32: 490; GFX10: ; %bb.0: ; %entry 491; GFX10-NEXT: s_clause 0x1 492; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 493; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 494; GFX10-NEXT: s_mov_b32 s6, -1 495; GFX10-NEXT: s_mov_b32 s7, 0x31016000 496; GFX10-NEXT: s_mov_b32 s14, s6 497; GFX10-NEXT: s_mov_b32 s15, s7 498; GFX10-NEXT: s_mov_b32 s10, s6 499; GFX10-NEXT: s_mov_b32 s11, s7 500; GFX10-NEXT: s_waitcnt lgkmcnt(0) 501; GFX10-NEXT: s_mov_b32 s12, s2 502; GFX10-NEXT: s_mov_b32 s13, s3 503; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 504; GFX10-NEXT: buffer_load_dword v1, off, s[8:11], 0 505; GFX10-NEXT: s_mov_b32 s4, s0 506; GFX10-NEXT: s_mov_b32 s5, s1 507; GFX10-NEXT: s_waitcnt vmcnt(0) 508; GFX10-NEXT: v_mul_lo_u32 v0, v1, v0 509; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 510; GFX10-NEXT: s_endpgm 511; 512; GFX11-LABEL: v_trunc_i64_mul_to_i32: 513; GFX11: ; %bb.0: ; %entry 514; GFX11-NEXT: s_clause 0x1 515; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 516; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 517; GFX11-NEXT: s_mov_b32 s10, -1 518; GFX11-NEXT: s_mov_b32 s11, 0x31016000 519; GFX11-NEXT: s_mov_b32 s14, s10 520; GFX11-NEXT: s_mov_b32 s15, s11 521; GFX11-NEXT: s_mov_b32 s6, 
s10 522; GFX11-NEXT: s_mov_b32 s7, s11 523; GFX11-NEXT: s_waitcnt lgkmcnt(0) 524; GFX11-NEXT: s_mov_b32 s12, s2 525; GFX11-NEXT: s_mov_b32 s13, s3 526; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 527; GFX11-NEXT: buffer_load_b32 v1, off, s[4:7], 0 528; GFX11-NEXT: s_mov_b32 s8, s0 529; GFX11-NEXT: s_mov_b32 s9, s1 530; GFX11-NEXT: s_waitcnt vmcnt(0) 531; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0 532; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 533; GFX11-NEXT: s_endpgm 534; 535; GFX12-LABEL: v_trunc_i64_mul_to_i32: 536; GFX12: ; %bb.0: ; %entry 537; GFX12-NEXT: s_clause 0x1 538; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 539; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 540; GFX12-NEXT: s_mov_b32 s10, -1 541; GFX12-NEXT: s_mov_b32 s11, 0x31016000 542; GFX12-NEXT: s_mov_b32 s14, s10 543; GFX12-NEXT: s_mov_b32 s15, s11 544; GFX12-NEXT: s_mov_b32 s6, s10 545; GFX12-NEXT: s_mov_b32 s7, s11 546; GFX12-NEXT: s_wait_kmcnt 0x0 547; GFX12-NEXT: s_mov_b32 s12, s2 548; GFX12-NEXT: s_mov_b32 s13, s3 549; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null 550; GFX12-NEXT: buffer_load_b32 v1, off, s[4:7], null 551; GFX12-NEXT: s_mov_b32 s8, s0 552; GFX12-NEXT: s_mov_b32 s9, s1 553; GFX12-NEXT: s_wait_loadcnt 0x0 554; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0 555; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null 556; GFX12-NEXT: s_endpgm 557; 558; EG-LABEL: v_trunc_i64_mul_to_i32: 559; EG: ; %bb.0: ; %entry 560; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 561; EG-NEXT: TEX 1 @6 562; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] 563; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 564; EG-NEXT: CF_END 565; EG-NEXT: PAD 566; EG-NEXT: Fetch clause starting at 6: 567; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 568; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 569; EG-NEXT: ALU clause starting at 10: 570; EG-NEXT: MOV T0.X, KC0[2].Z, 571; EG-NEXT: MOV * T1.X, KC0[2].W, 572; EG-NEXT: ALU clause starting at 12: 573; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 574; EG-NEXT: MULLO_INT * T0.X, T1.X, 
T0.X, 575; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 576entry: 577 %a = load i64, ptr addrspace(1) %aptr, align 8 578 %b = load i64, ptr addrspace(1) %bptr, align 8 579 %mul = mul i64 %b, %a 580 %trunc = trunc i64 %mul to i32 581 store i32 %trunc, ptr addrspace(1) %out, align 8 582 ret void 583} 584 585; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top 586; 32 bits of both arguments are sign bits. 587 588define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { 589; SI-LABEL: mul64_sext_c: 590; SI: ; %bb.0: ; %entry 591; SI-NEXT: s_load_dword s6, s[4:5], 0xb 592; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 593; SI-NEXT: v_mov_b32_e32 v0, 0x50 594; SI-NEXT: s_mov_b32 s3, 0xf000 595; SI-NEXT: s_mov_b32 s2, -1 596; SI-NEXT: s_waitcnt lgkmcnt(0) 597; SI-NEXT: v_mul_hi_i32 v1, s6, v0 598; SI-NEXT: s_mulk_i32 s6, 0x50 599; SI-NEXT: v_mov_b32_e32 v0, s6 600; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 601; SI-NEXT: s_endpgm 602; 603; VI-LABEL: mul64_sext_c: 604; VI: ; %bb.0: ; %entry 605; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 606; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 607; VI-NEXT: v_mov_b32_e32 v0, 0x50 608; VI-NEXT: s_waitcnt lgkmcnt(0) 609; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0 610; VI-NEXT: s_mov_b32 s3, 0xf000 611; VI-NEXT: s_mov_b32 s2, -1 612; VI-NEXT: s_nop 2 613; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 614; VI-NEXT: s_endpgm 615; 616; GFX9-LABEL: mul64_sext_c: 617; GFX9: ; %bb.0: ; %entry 618; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 619; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 620; GFX9-NEXT: s_mov_b32 s3, 0xf000 621; GFX9-NEXT: s_mov_b32 s2, -1 622; GFX9-NEXT: s_waitcnt lgkmcnt(0) 623; GFX9-NEXT: s_mul_hi_i32 s4, s6, 0x50 624; GFX9-NEXT: s_mulk_i32 s6, 0x50 625; GFX9-NEXT: v_mov_b32_e32 v0, s6 626; GFX9-NEXT: v_mov_b32_e32 v1, s4 627; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 628; GFX9-NEXT: s_endpgm 629; 630; GFX10-LABEL: mul64_sext_c: 631; GFX10: ; %bb.0: ; 
%entry 632; GFX10-NEXT: s_clause 0x1 633; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 634; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 635; GFX10-NEXT: s_waitcnt lgkmcnt(0) 636; GFX10-NEXT: s_mul_i32 s3, s2, 0x50 637; GFX10-NEXT: s_mul_hi_i32 s2, s2, 0x50 638; GFX10-NEXT: v_mov_b32_e32 v0, s3 639; GFX10-NEXT: v_mov_b32_e32 v1, s2 640; GFX10-NEXT: s_mov_b32 s3, 0x31016000 641; GFX10-NEXT: s_mov_b32 s2, -1 642; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 643; GFX10-NEXT: s_endpgm 644; 645; GFX11-LABEL: mul64_sext_c: 646; GFX11: ; %bb.0: ; %entry 647; GFX11-NEXT: s_clause 0x1 648; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 649; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 650; GFX11-NEXT: s_waitcnt lgkmcnt(0) 651; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 652; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50 653; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 654; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 655; GFX11-NEXT: s_mov_b32 s3, 0x31016000 656; GFX11-NEXT: s_mov_b32 s2, -1 657; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 658; GFX11-NEXT: s_endpgm 659; 660; GFX12-LABEL: mul64_sext_c: 661; GFX12: ; %bb.0: ; %entry 662; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 663; GFX12-NEXT: s_wait_kmcnt 0x0 664; GFX12-NEXT: s_ashr_i32 s3, s2, 31 665; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 666; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 667; GFX12-NEXT: s_mov_b32 s3, 0x31016000 668; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 669; GFX12-NEXT: s_mov_b32 s2, -1 670; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 671; GFX12-NEXT: s_endpgm 672; 673; EG-LABEL: mul64_sext_c: 674; EG: ; %bb.0: ; %entry 675; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 676; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 677; EG-NEXT: CF_END 678; EG-NEXT: PAD 679; EG-NEXT: ALU clause starting at 4: 680; EG-NEXT: MULHI_INT * T0.Y, KC0[2].Z, literal.x, 681; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 682; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 683; 
EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, 684; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 685entry: 686 %0 = sext i32 %in to i64 687 %1 = mul i64 %0, 80 688 store i64 %1, ptr addrspace(1) %out 689 ret void 690} 691 692define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { 693; SI-LABEL: mul64_zext_c: 694; SI: ; %bb.0: ; %entry 695; SI-NEXT: s_load_dword s6, s[4:5], 0xb 696; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 697; SI-NEXT: v_mov_b32_e32 v0, 0x50 698; SI-NEXT: s_mov_b32 s3, 0xf000 699; SI-NEXT: s_mov_b32 s2, -1 700; SI-NEXT: s_waitcnt lgkmcnt(0) 701; SI-NEXT: v_mul_hi_u32 v1, s6, v0 702; SI-NEXT: s_mulk_i32 s6, 0x50 703; SI-NEXT: v_mov_b32_e32 v0, s6 704; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 705; SI-NEXT: s_endpgm 706; 707; VI-LABEL: mul64_zext_c: 708; VI: ; %bb.0: ; %entry 709; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 710; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 711; VI-NEXT: v_mov_b32_e32 v0, 0x50 712; VI-NEXT: s_waitcnt lgkmcnt(0) 713; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0 714; VI-NEXT: s_mov_b32 s3, 0xf000 715; VI-NEXT: s_mov_b32 s2, -1 716; VI-NEXT: s_nop 2 717; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 718; VI-NEXT: s_endpgm 719; 720; GFX9-LABEL: mul64_zext_c: 721; GFX9: ; %bb.0: ; %entry 722; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 723; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 724; GFX9-NEXT: s_mov_b32 s3, 0xf000 725; GFX9-NEXT: s_mov_b32 s2, -1 726; GFX9-NEXT: s_waitcnt lgkmcnt(0) 727; GFX9-NEXT: s_mul_hi_u32 s4, s6, 0x50 728; GFX9-NEXT: s_mulk_i32 s6, 0x50 729; GFX9-NEXT: v_mov_b32_e32 v0, s6 730; GFX9-NEXT: v_mov_b32_e32 v1, s4 731; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 732; GFX9-NEXT: s_endpgm 733; 734; GFX10-LABEL: mul64_zext_c: 735; GFX10: ; %bb.0: ; %entry 736; GFX10-NEXT: s_clause 0x1 737; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 738; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 739; GFX10-NEXT: s_waitcnt lgkmcnt(0) 740; GFX10-NEXT: s_mul_i32 s3, s2, 
0x50 741; GFX10-NEXT: s_mul_hi_u32 s2, s2, 0x50 742; GFX10-NEXT: v_mov_b32_e32 v0, s3 743; GFX10-NEXT: v_mov_b32_e32 v1, s2 744; GFX10-NEXT: s_mov_b32 s3, 0x31016000 745; GFX10-NEXT: s_mov_b32 s2, -1 746; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 747; GFX10-NEXT: s_endpgm 748; 749; GFX11-LABEL: mul64_zext_c: 750; GFX11: ; %bb.0: ; %entry 751; GFX11-NEXT: s_clause 0x1 752; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 753; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 754; GFX11-NEXT: s_waitcnt lgkmcnt(0) 755; GFX11-NEXT: s_mul_i32 s3, s2, 0x50 756; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50 757; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 758; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 759; GFX11-NEXT: s_mov_b32 s3, 0x31016000 760; GFX11-NEXT: s_mov_b32 s2, -1 761; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 762; GFX11-NEXT: s_endpgm 763; 764; GFX12-LABEL: mul64_zext_c: 765; GFX12: ; %bb.0: ; %entry 766; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 767; GFX12-NEXT: s_mov_b32 s3, 0 768; GFX12-NEXT: s_wait_kmcnt 0x0 769; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 770; GFX12-NEXT: s_mov_b32 s3, 0x31016000 771; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 772; GFX12-NEXT: s_mov_b32 s2, -1 773; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 774; GFX12-NEXT: s_endpgm 775; 776; EG-LABEL: mul64_zext_c: 777; EG: ; %bb.0: ; %entry 778; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 779; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 780; EG-NEXT: CF_END 781; EG-NEXT: PAD 782; EG-NEXT: ALU clause starting at 4: 783; EG-NEXT: MULHI * T0.Y, KC0[2].Z, literal.x, 784; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 785; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 786; EG-NEXT: MULLO_INT * T0.X, KC0[2].Z, literal.y, 787; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 788entry: 789 %0 = zext i32 %in to i64 790 %1 = mul i64 %0, 80 791 store i64 %1, ptr addrspace(1) %out 792 ret void 793} 794 795define amdgpu_kernel void @v_mul64_sext_c(ptr 
addrspace(1) %out, ptr addrspace(1) %in) { 796; SI-LABEL: v_mul64_sext_c: 797; SI: ; %bb.0: ; %entry 798; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 799; SI-NEXT: s_mov_b32 s7, 0xf000 800; SI-NEXT: s_mov_b32 s6, -1 801; SI-NEXT: s_mov_b32 s10, s6 802; SI-NEXT: s_mov_b32 s11, s7 803; SI-NEXT: s_waitcnt lgkmcnt(0) 804; SI-NEXT: s_mov_b32 s8, s2 805; SI-NEXT: s_mov_b32 s9, s3 806; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 807; SI-NEXT: s_movk_i32 s2, 0x50 808; SI-NEXT: s_mov_b32 s4, s0 809; SI-NEXT: s_mov_b32 s5, s1 810; SI-NEXT: s_waitcnt vmcnt(0) 811; SI-NEXT: v_mul_hi_i32 v1, v0, s2 812; SI-NEXT: v_mul_lo_u32 v0, v0, s2 813; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 814; SI-NEXT: s_endpgm 815; 816; VI-LABEL: v_mul64_sext_c: 817; VI: ; %bb.0: ; %entry 818; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 819; VI-NEXT: s_mov_b32 s7, 0xf000 820; VI-NEXT: s_mov_b32 s6, -1 821; VI-NEXT: s_mov_b32 s10, s6 822; VI-NEXT: s_mov_b32 s11, s7 823; VI-NEXT: s_waitcnt lgkmcnt(0) 824; VI-NEXT: s_mov_b32 s8, s2 825; VI-NEXT: s_mov_b32 s9, s3 826; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 827; VI-NEXT: s_movk_i32 s2, 0x50 828; VI-NEXT: s_mov_b32 s4, s0 829; VI-NEXT: s_mov_b32 s5, s1 830; VI-NEXT: s_waitcnt vmcnt(0) 831; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0 832; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 833; VI-NEXT: s_endpgm 834; 835; GFX9-LABEL: v_mul64_sext_c: 836; GFX9: ; %bb.0: ; %entry 837; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 838; GFX9-NEXT: s_mov_b32 s7, 0xf000 839; GFX9-NEXT: s_mov_b32 s6, -1 840; GFX9-NEXT: s_mov_b32 s10, s6 841; GFX9-NEXT: s_mov_b32 s11, s7 842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 843; GFX9-NEXT: s_mov_b32 s8, s2 844; GFX9-NEXT: s_mov_b32 s9, s3 845; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 846; GFX9-NEXT: s_movk_i32 s2, 0x50 847; GFX9-NEXT: s_mov_b32 s4, s0 848; GFX9-NEXT: s_mov_b32 s5, s1 849; GFX9-NEXT: s_waitcnt vmcnt(0) 850; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 851; GFX9-NEXT: v_mul_lo_u32 v0, v0, 
s2 852; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 853; GFX9-NEXT: s_endpgm 854; 855; GFX10-LABEL: v_mul64_sext_c: 856; GFX10: ; %bb.0: ; %entry 857; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 858; GFX10-NEXT: s_mov_b32 s6, -1 859; GFX10-NEXT: s_mov_b32 s7, 0x31016000 860; GFX10-NEXT: s_mov_b32 s10, s6 861; GFX10-NEXT: s_mov_b32 s11, s7 862; GFX10-NEXT: s_waitcnt lgkmcnt(0) 863; GFX10-NEXT: s_mov_b32 s8, s2 864; GFX10-NEXT: s_mov_b32 s9, s3 865; GFX10-NEXT: s_mov_b32 s4, s0 866; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 867; GFX10-NEXT: s_mov_b32 s5, s1 868; GFX10-NEXT: s_waitcnt vmcnt(0) 869; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0 870; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 871; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 872; GFX10-NEXT: s_endpgm 873; 874; GFX11-LABEL: v_mul64_sext_c: 875; GFX11: ; %bb.0: ; %entry 876; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 877; GFX11-NEXT: s_mov_b32 s6, -1 878; GFX11-NEXT: s_mov_b32 s7, 0x31016000 879; GFX11-NEXT: s_mov_b32 s10, s6 880; GFX11-NEXT: s_mov_b32 s11, s7 881; GFX11-NEXT: s_waitcnt lgkmcnt(0) 882; GFX11-NEXT: s_mov_b32 s8, s2 883; GFX11-NEXT: s_mov_b32 s9, s3 884; GFX11-NEXT: s_mov_b32 s4, s0 885; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 886; GFX11-NEXT: s_mov_b32 s5, s1 887; GFX11-NEXT: s_waitcnt vmcnt(0) 888; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0 889; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 890; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 891; GFX11-NEXT: s_endpgm 892; 893; GFX12-LABEL: v_mul64_sext_c: 894; GFX12: ; %bb.0: ; %entry 895; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 896; GFX12-NEXT: s_mov_b32 s6, -1 897; GFX12-NEXT: s_mov_b32 s7, 0x31016000 898; GFX12-NEXT: s_mov_b32 s10, s6 899; GFX12-NEXT: s_mov_b32 s11, s7 900; GFX12-NEXT: s_wait_kmcnt 0x0 901; GFX12-NEXT: s_mov_b32 s8, s2 902; GFX12-NEXT: s_mov_b32 s9, s3 903; GFX12-NEXT: s_mov_b32 s4, s0 904; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null 905; GFX12-NEXT: s_mov_b32 s5, s1 906; GFX12-NEXT: 
s_wait_loadcnt 0x0 907; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0 908; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 909; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 910; GFX12-NEXT: s_endpgm 911; 912; EG-LABEL: v_mul64_sext_c: 913; EG: ; %bb.0: ; %entry 914; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 915; EG-NEXT: TEX 0 @6 916; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 917; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 918; EG-NEXT: CF_END 919; EG-NEXT: PAD 920; EG-NEXT: Fetch clause starting at 6: 921; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 922; EG-NEXT: ALU clause starting at 8: 923; EG-NEXT: MOV * T0.X, KC0[2].Z, 924; EG-NEXT: ALU clause starting at 9: 925; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 926; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 927; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 928; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 929; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 930entry: 931 %val = load i32, ptr addrspace(1) %in, align 4 932 %ext = sext i32 %val to i64 933 %mul = mul i64 %ext, 80 934 store i64 %mul, ptr addrspace(1) %out, align 8 935 ret void 936} 937 ; v_mul64_zext_c loads an i32 from %in, zero-extends it to i64, multiplies it by the constant 80 and stores the 64-bit product to %out. In the SI, VI and GFX9 checks 80 is first moved into an SGPR (s_movk_i32 0x50); in the GFX10, GFX11 and GFX12 checks it is used as a literal operand of the multiplies. 938define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) { 939; SI-LABEL: v_mul64_zext_c: 940; SI: ; %bb.0: ; %entry 941; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 942; SI-NEXT: s_mov_b32 s7, 0xf000 943; SI-NEXT: s_mov_b32 s6, -1 944; SI-NEXT: s_mov_b32 s10, s6 945; SI-NEXT: s_mov_b32 s11, s7 946; SI-NEXT: s_waitcnt lgkmcnt(0) 947; SI-NEXT: s_mov_b32 s8, s2 948; SI-NEXT: s_mov_b32 s9, s3 949; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 950; SI-NEXT: s_movk_i32 s2, 0x50 951; SI-NEXT: s_mov_b32 s4, s0 952; SI-NEXT: s_mov_b32 s5, s1 953; SI-NEXT: s_waitcnt vmcnt(0) 954; SI-NEXT: v_mul_hi_u32 v1, v0, s2 955; SI-NEXT: v_mul_lo_u32 v0, v0, s2 956; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 957; SI-NEXT: s_endpgm 958; 959; VI-LABEL: v_mul64_zext_c: 960; VI: ; %bb.0: ; %entry 961; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 962; VI-NEXT: 
s_mov_b32 s7, 0xf000 963; VI-NEXT: s_mov_b32 s6, -1 964; VI-NEXT: s_mov_b32 s10, s6 965; VI-NEXT: s_mov_b32 s11, s7 966; VI-NEXT: s_waitcnt lgkmcnt(0) 967; VI-NEXT: s_mov_b32 s8, s2 968; VI-NEXT: s_mov_b32 s9, s3 969; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 970; VI-NEXT: s_movk_i32 s2, 0x50 971; VI-NEXT: s_mov_b32 s4, s0 972; VI-NEXT: s_mov_b32 s5, s1 973; VI-NEXT: s_waitcnt vmcnt(0) 974; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0 975; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 976; VI-NEXT: s_endpgm 977; 978; GFX9-LABEL: v_mul64_zext_c: 979; GFX9: ; %bb.0: ; %entry 980; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 981; GFX9-NEXT: s_mov_b32 s7, 0xf000 982; GFX9-NEXT: s_mov_b32 s6, -1 983; GFX9-NEXT: s_mov_b32 s10, s6 984; GFX9-NEXT: s_mov_b32 s11, s7 985; GFX9-NEXT: s_waitcnt lgkmcnt(0) 986; GFX9-NEXT: s_mov_b32 s8, s2 987; GFX9-NEXT: s_mov_b32 s9, s3 988; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 989; GFX9-NEXT: s_movk_i32 s2, 0x50 990; GFX9-NEXT: s_mov_b32 s4, s0 991; GFX9-NEXT: s_mov_b32 s5, s1 992; GFX9-NEXT: s_waitcnt vmcnt(0) 993; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2 994; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 995; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 996; GFX9-NEXT: s_endpgm 997; 998; GFX10-LABEL: v_mul64_zext_c: 999; GFX10: ; %bb.0: ; %entry 1000; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1001; GFX10-NEXT: s_mov_b32 s6, -1 1002; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1003; GFX10-NEXT: s_mov_b32 s10, s6 1004; GFX10-NEXT: s_mov_b32 s11, s7 1005; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX10-NEXT: s_mov_b32 s8, s2 1007; GFX10-NEXT: s_mov_b32 s9, s3 1008; GFX10-NEXT: s_mov_b32 s4, s0 1009; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 1010; GFX10-NEXT: s_mov_b32 s5, s1 1011; GFX10-NEXT: s_waitcnt vmcnt(0) 1012; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0 1013; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0 1014; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1015; GFX10-NEXT: s_endpgm 1016; 1017; GFX11-LABEL: 
v_mul64_zext_c: 1018; GFX11: ; %bb.0: ; %entry 1019; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1020; GFX11-NEXT: s_mov_b32 s6, -1 1021; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1022; GFX11-NEXT: s_mov_b32 s10, s6 1023; GFX11-NEXT: s_mov_b32 s11, s7 1024; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX11-NEXT: s_mov_b32 s8, s2 1026; GFX11-NEXT: s_mov_b32 s9, s3 1027; GFX11-NEXT: s_mov_b32 s4, s0 1028; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 1029; GFX11-NEXT: s_mov_b32 s5, s1 1030; GFX11-NEXT: s_waitcnt vmcnt(0) 1031; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0 1032; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0 1033; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 1034; GFX11-NEXT: s_endpgm 1035; 1036; GFX12-LABEL: v_mul64_zext_c: 1037; GFX12: ; %bb.0: ; %entry 1038; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1039; GFX12-NEXT: s_mov_b32 s6, -1 1040; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1041; GFX12-NEXT: s_mov_b32 s10, s6 1042; GFX12-NEXT: s_mov_b32 s11, s7 1043; GFX12-NEXT: s_wait_kmcnt 0x0 1044; GFX12-NEXT: s_mov_b32 s8, s2 1045; GFX12-NEXT: s_mov_b32 s9, s3 1046; GFX12-NEXT: s_mov_b32 s4, s0 1047; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null 1048; GFX12-NEXT: s_mov_b32 s5, s1 1049; GFX12-NEXT: s_wait_loadcnt 0x0 1050; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0 1051; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0 1052; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 1053; GFX12-NEXT: s_endpgm 1054; 1055; EG-LABEL: v_mul64_zext_c: 1056; EG: ; %bb.0: ; %entry 1057; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1058; EG-NEXT: TEX 0 @6 1059; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1060; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1061; EG-NEXT: CF_END 1062; EG-NEXT: PAD 1063; EG-NEXT: Fetch clause starting at 6: 1064; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1065; EG-NEXT: ALU clause starting at 8: 1066; EG-NEXT: MOV * T0.X, KC0[2].Z, 1067; EG-NEXT: ALU clause starting at 9: 1068; EG-NEXT: MULHI * T0.Y, T0.X, literal.x, 1069; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00) 1070; 
EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1071; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 1072; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43) 1073entry: 1074 %val = load i32, ptr addrspace(1) %in, align 4 1075 %ext = zext i32 %val to i64 1076 %mul = mul i64 %ext, 80 1077 store i64 %mul, ptr addrspace(1) %out, align 8 1078 ret void 1079} 1080 ; v_mul64_sext_inline_imm loads an i32 from %in, sign-extends it to i64, multiplies it by the constant 9 and stores the 64-bit product to %out. In every amdgcn check block below, 9 appears directly as an operand of the multiply instruction. 1081define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1082; SI-LABEL: v_mul64_sext_inline_imm: 1083; SI: ; %bb.0: ; %entry 1084; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1085; SI-NEXT: s_mov_b32 s7, 0xf000 1086; SI-NEXT: s_mov_b32 s6, -1 1087; SI-NEXT: s_mov_b32 s10, s6 1088; SI-NEXT: s_mov_b32 s11, s7 1089; SI-NEXT: s_waitcnt lgkmcnt(0) 1090; SI-NEXT: s_mov_b32 s8, s2 1091; SI-NEXT: s_mov_b32 s9, s3 1092; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1093; SI-NEXT: s_mov_b32 s4, s0 1094; SI-NEXT: s_mov_b32 s5, s1 1095; SI-NEXT: s_waitcnt vmcnt(0) 1096; SI-NEXT: v_mul_hi_i32 v1, v0, 9 1097; SI-NEXT: v_mul_lo_u32 v0, v0, 9 1098; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1099; SI-NEXT: s_endpgm 1100; 1101; VI-LABEL: v_mul64_sext_inline_imm: 1102; VI: ; %bb.0: ; %entry 1103; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1104; VI-NEXT: s_mov_b32 s7, 0xf000 1105; VI-NEXT: s_mov_b32 s6, -1 1106; VI-NEXT: s_mov_b32 s10, s6 1107; VI-NEXT: s_mov_b32 s11, s7 1108; VI-NEXT: s_waitcnt lgkmcnt(0) 1109; VI-NEXT: s_mov_b32 s8, s2 1110; VI-NEXT: s_mov_b32 s9, s3 1111; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1112; VI-NEXT: s_mov_b32 s4, s0 1113; VI-NEXT: s_mov_b32 s5, s1 1114; VI-NEXT: s_waitcnt vmcnt(0) 1115; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0 1116; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1117; VI-NEXT: s_endpgm 1118; 1119; GFX9-LABEL: v_mul64_sext_inline_imm: 1120; GFX9: ; %bb.0: ; %entry 1121; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1122; GFX9-NEXT: s_mov_b32 s7, 0xf000 1123; GFX9-NEXT: s_mov_b32 s6, -1 1124; GFX9-NEXT: s_mov_b32 s10, s6 1125; 
GFX9-NEXT: s_mov_b32 s11, s7 1126; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1127; GFX9-NEXT: s_mov_b32 s8, s2 1128; GFX9-NEXT: s_mov_b32 s9, s3 1129; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 1130; GFX9-NEXT: s_mov_b32 s4, s0 1131; GFX9-NEXT: s_mov_b32 s5, s1 1132; GFX9-NEXT: s_waitcnt vmcnt(0) 1133; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9 1134; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9 1135; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1136; GFX9-NEXT: s_endpgm 1137; 1138; GFX10-LABEL: v_mul64_sext_inline_imm: 1139; GFX10: ; %bb.0: ; %entry 1140; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1141; GFX10-NEXT: s_mov_b32 s6, -1 1142; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1143; GFX10-NEXT: s_mov_b32 s10, s6 1144; GFX10-NEXT: s_mov_b32 s11, s7 1145; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1146; GFX10-NEXT: s_mov_b32 s8, s2 1147; GFX10-NEXT: s_mov_b32 s9, s3 1148; GFX10-NEXT: s_mov_b32 s4, s0 1149; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 1150; GFX10-NEXT: s_mov_b32 s5, s1 1151; GFX10-NEXT: s_waitcnt vmcnt(0) 1152; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9 1153; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9 1154; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1155; GFX10-NEXT: s_endpgm 1156; 1157; GFX11-LABEL: v_mul64_sext_inline_imm: 1158; GFX11: ; %bb.0: ; %entry 1159; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1160; GFX11-NEXT: s_mov_b32 s6, -1 1161; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1162; GFX11-NEXT: s_mov_b32 s10, s6 1163; GFX11-NEXT: s_mov_b32 s11, s7 1164; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX11-NEXT: s_mov_b32 s8, s2 1166; GFX11-NEXT: s_mov_b32 s9, s3 1167; GFX11-NEXT: s_mov_b32 s4, s0 1168; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 1169; GFX11-NEXT: s_mov_b32 s5, s1 1170; GFX11-NEXT: s_waitcnt vmcnt(0) 1171; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9 1172; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9 1173; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 1174; GFX11-NEXT: s_endpgm 1175; 1176; GFX12-LABEL: v_mul64_sext_inline_imm: 1177; GFX12: ; %bb.0: ; %entry 1178; 
GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1179; GFX12-NEXT: s_mov_b32 s6, -1 1180; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1181; GFX12-NEXT: s_mov_b32 s10, s6 1182; GFX12-NEXT: s_mov_b32 s11, s7 1183; GFX12-NEXT: s_wait_kmcnt 0x0 1184; GFX12-NEXT: s_mov_b32 s8, s2 1185; GFX12-NEXT: s_mov_b32 s9, s3 1186; GFX12-NEXT: s_mov_b32 s4, s0 1187; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null 1188; GFX12-NEXT: s_mov_b32 s5, s1 1189; GFX12-NEXT: s_wait_loadcnt 0x0 1190; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0 1191; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0 1192; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null 1193; GFX12-NEXT: s_endpgm 1194; 1195; EG-LABEL: v_mul64_sext_inline_imm: 1196; EG: ; %bb.0: ; %entry 1197; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1198; EG-NEXT: TEX 0 @6 1199; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] 1200; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1201; EG-NEXT: CF_END 1202; EG-NEXT: PAD 1203; EG-NEXT: Fetch clause starting at 6: 1204; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1205; EG-NEXT: ALU clause starting at 8: 1206; EG-NEXT: MOV * T0.X, KC0[2].Z, 1207; EG-NEXT: ALU clause starting at 9: 1208; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 1209; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00) 1210; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1211; EG-NEXT: MULLO_INT * T0.X, T0.X, literal.y, 1212; EG-NEXT: 2(2.802597e-45), 9(1.261169e-44) 1213entry: 1214 %val = load i32, ptr addrspace(1) %in, align 4 1215 %ext = sext i32 %val to i64 1216 %mul = mul i64 %ext, 9 1217 store i64 %mul, ptr addrspace(1) %out, align 8 1218 ret void 1219} 1220 ; s_mul_i32 multiplies the two i32 kernel arguments %a and %b and stores the i32 result to %out. Each operand is preceded by an unnamed [8 x i32] argument in the signature; the amdgcn checks fetch %a and %b with two separate scalar loads and multiply them with s_mul_i32. 1221define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind { 1222; SI-LABEL: s_mul_i32: 1223; SI: ; %bb.0: ; %entry 1224; SI-NEXT: s_load_dword s6, s[4:5], 0x13 1225; SI-NEXT: s_load_dword s7, s[4:5], 0x1c 1226; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1227; SI-NEXT: s_mov_b32 s3, 0xf000 1228; SI-NEXT: s_mov_b32 s2, -1 1229; SI-NEXT: s_waitcnt lgkmcnt(0) 1230; 
SI-NEXT: s_mul_i32 s4, s6, s7 1231; SI-NEXT: v_mov_b32_e32 v0, s4 1232; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1233; SI-NEXT: s_endpgm 1234; 1235; VI-LABEL: s_mul_i32: 1236; VI: ; %bb.0: ; %entry 1237; VI-NEXT: s_load_dword s6, s[4:5], 0x4c 1238; VI-NEXT: s_load_dword s7, s[4:5], 0x70 1239; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1240; VI-NEXT: s_mov_b32 s3, 0xf000 1241; VI-NEXT: s_mov_b32 s2, -1 1242; VI-NEXT: s_waitcnt lgkmcnt(0) 1243; VI-NEXT: s_mul_i32 s4, s6, s7 1244; VI-NEXT: v_mov_b32_e32 v0, s4 1245; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1246; VI-NEXT: s_endpgm 1247; 1248; GFX9-LABEL: s_mul_i32: 1249; GFX9: ; %bb.0: ; %entry 1250; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c 1251; GFX9-NEXT: s_load_dword s7, s[4:5], 0x70 1252; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1253; GFX9-NEXT: s_mov_b32 s3, 0xf000 1254; GFX9-NEXT: s_mov_b32 s2, -1 1255; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX9-NEXT: s_mul_i32 s4, s6, s7 1257; GFX9-NEXT: v_mov_b32_e32 v0, s4 1258; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1259; GFX9-NEXT: s_endpgm 1260; 1261; GFX10-LABEL: s_mul_i32: 1262; GFX10: ; %bb.0: ; %entry 1263; GFX10-NEXT: s_clause 0x2 1264; GFX10-NEXT: s_load_dword s2, s[4:5], 0x4c 1265; GFX10-NEXT: s_load_dword s3, s[4:5], 0x70 1266; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1267; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1268; GFX10-NEXT: s_mul_i32 s2, s2, s3 1269; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1270; GFX10-NEXT: v_mov_b32_e32 v0, s2 1271; GFX10-NEXT: s_mov_b32 s2, -1 1272; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 1273; GFX10-NEXT: s_endpgm 1274; 1275; GFX11-LABEL: s_mul_i32: 1276; GFX11: ; %bb.0: ; %entry 1277; GFX11-NEXT: s_clause 0x2 1278; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c 1279; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x70 1280; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1281; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1282; GFX11-NEXT: s_mul_i32 s2, s2, s3 1283; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1284; GFX11-NEXT: v_mov_b32_e32 v0, 
s2 1285; GFX11-NEXT: s_mov_b32 s2, -1 1286; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1287; GFX11-NEXT: s_endpgm 1288; 1289; GFX12-LABEL: s_mul_i32: 1290; GFX12: ; %bb.0: ; %entry 1291; GFX12-NEXT: s_clause 0x2 1292; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x4c 1293; GFX12-NEXT: s_load_b32 s3, s[4:5], 0x70 1294; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1295; GFX12-NEXT: s_wait_kmcnt 0x0 1296; GFX12-NEXT: s_mul_i32 s2, s2, s3 1297; GFX12-NEXT: s_mov_b32 s3, 0x31016000 1298; GFX12-NEXT: v_mov_b32_e32 v0, s2 1299; GFX12-NEXT: s_mov_b32 s2, -1 1300; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null 1301; GFX12-NEXT: s_endpgm 1302; 1303; EG-LABEL: s_mul_i32: 1304; EG: ; %bb.0: ; %entry 1305; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1306; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1307; EG-NEXT: CF_END 1308; EG-NEXT: PAD 1309; EG-NEXT: ALU clause starting at 4: 1310; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1311; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1312; EG-NEXT: MULLO_INT * T1.X, KC0[4].Z, KC0[6].W, 1313entry: 1314 %mul = mul i32 %a, %b 1315 store i32 %mul, ptr addrspace(1) %out, align 4 1316 ret void 1317} 1318 ; v_mul_i32 loads two consecutive i32 values from %in (elements 0 and 1), multiplies them and stores the i32 product to %out. Every check block fetches both values with a single 64-bit load. 1319define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1320; SI-LABEL: v_mul_i32: 1321; SI: ; %bb.0: ; %entry 1322; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1323; SI-NEXT: s_mov_b32 s7, 0xf000 1324; SI-NEXT: s_mov_b32 s6, -1 1325; SI-NEXT: s_mov_b32 s10, s6 1326; SI-NEXT: s_mov_b32 s11, s7 1327; SI-NEXT: s_waitcnt lgkmcnt(0) 1328; SI-NEXT: s_mov_b32 s8, s2 1329; SI-NEXT: s_mov_b32 s9, s3 1330; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1331; SI-NEXT: s_mov_b32 s4, s0 1332; SI-NEXT: s_mov_b32 s5, s1 1333; SI-NEXT: s_waitcnt vmcnt(0) 1334; SI-NEXT: v_mul_lo_u32 v0, v0, v1 1335; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1336; SI-NEXT: s_endpgm 1337; 1338; VI-LABEL: v_mul_i32: 1339; VI: ; %bb.0: ; %entry 1340; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1341; VI-NEXT: s_mov_b32 s7, 0xf000 1342; 
VI-NEXT: s_mov_b32 s6, -1 1343; VI-NEXT: s_mov_b32 s10, s6 1344; VI-NEXT: s_mov_b32 s11, s7 1345; VI-NEXT: s_waitcnt lgkmcnt(0) 1346; VI-NEXT: s_mov_b32 s8, s2 1347; VI-NEXT: s_mov_b32 s9, s3 1348; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1349; VI-NEXT: s_mov_b32 s4, s0 1350; VI-NEXT: s_mov_b32 s5, s1 1351; VI-NEXT: s_waitcnt vmcnt(0) 1352; VI-NEXT: v_mul_lo_u32 v0, v0, v1 1353; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1354; VI-NEXT: s_endpgm 1355; 1356; GFX9-LABEL: v_mul_i32: 1357; GFX9: ; %bb.0: ; %entry 1358; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1359; GFX9-NEXT: s_mov_b32 s7, 0xf000 1360; GFX9-NEXT: s_mov_b32 s6, -1 1361; GFX9-NEXT: s_mov_b32 s10, s6 1362; GFX9-NEXT: s_mov_b32 s11, s7 1363; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX9-NEXT: s_mov_b32 s8, s2 1365; GFX9-NEXT: s_mov_b32 s9, s3 1366; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1367; GFX9-NEXT: s_mov_b32 s4, s0 1368; GFX9-NEXT: s_mov_b32 s5, s1 1369; GFX9-NEXT: s_waitcnt vmcnt(0) 1370; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 1371; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1372; GFX9-NEXT: s_endpgm 1373; 1374; GFX10-LABEL: v_mul_i32: 1375; GFX10: ; %bb.0: ; %entry 1376; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1377; GFX10-NEXT: s_mov_b32 s6, -1 1378; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1379; GFX10-NEXT: s_mov_b32 s10, s6 1380; GFX10-NEXT: s_mov_b32 s11, s7 1381; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1382; GFX10-NEXT: s_mov_b32 s8, s2 1383; GFX10-NEXT: s_mov_b32 s9, s3 1384; GFX10-NEXT: s_mov_b32 s4, s0 1385; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1386; GFX10-NEXT: s_mov_b32 s5, s1 1387; GFX10-NEXT: s_waitcnt vmcnt(0) 1388; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 1389; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 1390; GFX10-NEXT: s_endpgm 1391; 1392; GFX11-LABEL: v_mul_i32: 1393; GFX11: ; %bb.0: ; %entry 1394; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1395; GFX11-NEXT: s_mov_b32 s6, -1 1396; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1397; 
GFX11-NEXT: s_mov_b32 s10, s6 1398; GFX11-NEXT: s_mov_b32 s11, s7 1399; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1400; GFX11-NEXT: s_mov_b32 s8, s2 1401; GFX11-NEXT: s_mov_b32 s9, s3 1402; GFX11-NEXT: s_mov_b32 s4, s0 1403; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 1404; GFX11-NEXT: s_mov_b32 s5, s1 1405; GFX11-NEXT: s_waitcnt vmcnt(0) 1406; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 1407; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 1408; GFX11-NEXT: s_endpgm 1409; 1410; GFX12-LABEL: v_mul_i32: 1411; GFX12: ; %bb.0: ; %entry 1412; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1413; GFX12-NEXT: s_mov_b32 s6, -1 1414; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1415; GFX12-NEXT: s_mov_b32 s10, s6 1416; GFX12-NEXT: s_mov_b32 s11, s7 1417; GFX12-NEXT: s_wait_kmcnt 0x0 1418; GFX12-NEXT: s_mov_b32 s8, s2 1419; GFX12-NEXT: s_mov_b32 s9, s3 1420; GFX12-NEXT: s_mov_b32 s4, s0 1421; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null 1422; GFX12-NEXT: s_mov_b32 s5, s1 1423; GFX12-NEXT: s_wait_loadcnt 0x0 1424; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 1425; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null 1426; GFX12-NEXT: s_endpgm 1427; 1428; EG-LABEL: v_mul_i32: 1429; EG: ; %bb.0: ; %entry 1430; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1431; EG-NEXT: TEX 0 @6 1432; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1433; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1434; EG-NEXT: CF_END 1435; EG-NEXT: PAD 1436; EG-NEXT: Fetch clause starting at 6: 1437; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1438; EG-NEXT: ALU clause starting at 8: 1439; EG-NEXT: MOV * T0.X, KC0[2].Z, 1440; EG-NEXT: ALU clause starting at 9: 1441; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1442; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y, 1443; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1444entry: 1445 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1446 %a = load i32, ptr addrspace(1) %in 1447 %b = load i32, ptr addrspace(1) %b_ptr 1448 %result = mul i32 %a, %b 1449 store i32 %result, ptr addrspace(1) %out 1450 ret 
void 1451} 1452 1453define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { 1454; SI-LABEL: s_mul_i1: 1455; SI: ; %bb.0: ; %entry 1456; SI-NEXT: s_load_dword s6, s[4:5], 0x13 1457; SI-NEXT: s_load_dword s7, s[4:5], 0x1c 1458; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1459; SI-NEXT: s_mov_b32 s3, 0xf000 1460; SI-NEXT: s_mov_b32 s2, -1 1461; SI-NEXT: s_waitcnt lgkmcnt(0) 1462; SI-NEXT: s_mul_i32 s6, s6, s7 1463; SI-NEXT: s_and_b32 s4, s6, 1 1464; SI-NEXT: v_mov_b32_e32 v0, s4 1465; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1466; SI-NEXT: s_endpgm 1467; 1468; VI-LABEL: s_mul_i1: 1469; VI: ; %bb.0: ; %entry 1470; VI-NEXT: s_load_dword s6, s[4:5], 0x4c 1471; VI-NEXT: s_load_dword s7, s[4:5], 0x70 1472; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1473; VI-NEXT: s_mov_b32 s3, 0xf000 1474; VI-NEXT: s_mov_b32 s2, -1 1475; VI-NEXT: s_waitcnt lgkmcnt(0) 1476; VI-NEXT: s_mul_i32 s6, s6, s7 1477; VI-NEXT: s_and_b32 s4, s6, 1 1478; VI-NEXT: v_mov_b32_e32 v0, s4 1479; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1480; VI-NEXT: s_endpgm 1481; 1482; GFX9-LABEL: s_mul_i1: 1483; GFX9: ; %bb.0: ; %entry 1484; GFX9-NEXT: s_load_dword s6, s[4:5], 0x4c 1485; GFX9-NEXT: s_load_dword s7, s[4:5], 0x70 1486; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1487; GFX9-NEXT: s_mov_b32 s3, 0xf000 1488; GFX9-NEXT: s_mov_b32 s2, -1 1489; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1490; GFX9-NEXT: s_mul_i32 s6, s6, s7 1491; GFX9-NEXT: s_and_b32 s4, s6, 1 1492; GFX9-NEXT: v_mov_b32_e32 v0, s4 1493; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 1494; GFX9-NEXT: s_endpgm 1495; 1496; GFX10-LABEL: s_mul_i1: 1497; GFX10: ; %bb.0: ; %entry 1498; GFX10-NEXT: s_clause 0x2 1499; GFX10-NEXT: s_load_dword s2, s[4:5], 0x4c 1500; GFX10-NEXT: s_load_dword s3, s[4:5], 0x70 1501; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1502; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1503; GFX10-NEXT: s_mul_i32 s2, s2, s3 1504; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1505; GFX10-NEXT: 
s_and_b32 s2, s2, 1 1506; GFX10-NEXT: v_mov_b32_e32 v0, s2 1507; GFX10-NEXT: s_mov_b32 s2, -1 1508; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0 1509; GFX10-NEXT: s_endpgm 1510; 1511; GFX11-LABEL: s_mul_i1: 1512; GFX11: ; %bb.0: ; %entry 1513; GFX11-NEXT: s_clause 0x2 1514; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x4c 1515; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x70 1516; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1517; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1518; GFX11-NEXT: s_mul_i32 s2, s2, s3 1519; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1520; GFX11-NEXT: s_and_b32 s2, s2, 1 1521; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1522; GFX11-NEXT: v_mov_b32_e32 v0, s2 1523; GFX11-NEXT: s_mov_b32 s2, -1 1524; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 1525; GFX11-NEXT: s_endpgm 1526; 1527; GFX12-LABEL: s_mul_i1: 1528; GFX12: ; %bb.0: ; %entry 1529; GFX12-NEXT: s_clause 0x2 1530; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x4c 1531; GFX12-NEXT: s_load_b32 s3, s[4:5], 0x70 1532; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1533; GFX12-NEXT: s_wait_kmcnt 0x0 1534; GFX12-NEXT: s_mul_i32 s2, s2, s3 1535; GFX12-NEXT: s_mov_b32 s3, 0x31016000 1536; GFX12-NEXT: s_and_b32 s2, s2, 1 1537; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1538; GFX12-NEXT: v_mov_b32_e32 v0, s2 1539; GFX12-NEXT: s_mov_b32 s2, -1 1540; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null 1541; GFX12-NEXT: s_endpgm 1542; 1543; EG-LABEL: s_mul_i1: 1544; EG: ; %bb.0: ; %entry 1545; EG-NEXT: ALU 0, @10, KC0[], KC1[] 1546; EG-NEXT: TEX 1 @6 1547; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 1548; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1549; EG-NEXT: CF_END 1550; EG-NEXT: PAD 1551; EG-NEXT: Fetch clause starting at 6: 1552; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3 1553; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3 1554; EG-NEXT: ALU clause starting at 10: 1555; EG-NEXT: MOV * T0.X, 0.0, 1556; EG-NEXT: ALU clause starting at 11: 1557; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 1558; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, 1559; EG-NEXT: 
3(4.203895e-45), 0(0.000000e+00) 1560; EG-NEXT: AND_INT T1.W, PS, 1, 1561; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1562; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1563; EG-NEXT: LSHL T0.X, PV.W, PS, 1564; EG-NEXT: LSHL * T0.W, literal.x, PS, 1565; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1566; EG-NEXT: MOV T0.Y, 0.0, 1567; EG-NEXT: MOV * T0.Z, 0.0, 1568; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1569; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1570entry: 1571 %mul = mul i1 %a, %b 1572 store i1 %mul, ptr addrspace(1) %out, align 4 1573 ret void 1574} 1575 ; v_mul_i1 loads two i1 values, multiplies them and stores the i1 result to %out. The first value is read from %in; %b_ptr is formed with an i32-typed getelementptr, so the second value is read 4 bytes past %in (offset 4 in the checks). The amdgcn checks multiply with v_mul_lo_u32 and then AND the result with 1. 1576define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1577; SI-LABEL: v_mul_i1: 1578; SI: ; %bb.0: ; %entry 1579; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1580; SI-NEXT: s_mov_b32 s7, 0xf000 1581; SI-NEXT: s_mov_b32 s6, -1 1582; SI-NEXT: s_mov_b32 s10, s6 1583; SI-NEXT: s_mov_b32 s11, s7 1584; SI-NEXT: s_waitcnt lgkmcnt(0) 1585; SI-NEXT: s_mov_b32 s8, s2 1586; SI-NEXT: s_mov_b32 s9, s3 1587; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1588; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1589; SI-NEXT: s_mov_b32 s4, s0 1590; SI-NEXT: s_mov_b32 s5, s1 1591; SI-NEXT: s_waitcnt vmcnt(0) 1592; SI-NEXT: v_mul_lo_u32 v0, v0, v1 1593; SI-NEXT: v_and_b32_e32 v0, 1, v0 1594; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1595; SI-NEXT: s_endpgm 1596; 1597; VI-LABEL: v_mul_i1: 1598; VI: ; %bb.0: ; %entry 1599; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1600; VI-NEXT: s_mov_b32 s7, 0xf000 1601; VI-NEXT: s_mov_b32 s6, -1 1602; VI-NEXT: s_mov_b32 s10, s6 1603; VI-NEXT: s_mov_b32 s11, s7 1604; VI-NEXT: s_waitcnt lgkmcnt(0) 1605; VI-NEXT: s_mov_b32 s8, s2 1606; VI-NEXT: s_mov_b32 s9, s3 1607; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1608; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1609; VI-NEXT: s_mov_b32 s4, s0 1610; VI-NEXT: s_mov_b32 s5, s1 1611; VI-NEXT: s_waitcnt vmcnt(0) 1612; VI-NEXT: v_mul_lo_u32 v0, v0, v1 1613; VI-NEXT: v_and_b32_e32 v0, 1, v0 1614; 
VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1615; VI-NEXT: s_endpgm 1616; 1617; GFX9-LABEL: v_mul_i1: 1618; GFX9: ; %bb.0: ; %entry 1619; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1620; GFX9-NEXT: s_mov_b32 s7, 0xf000 1621; GFX9-NEXT: s_mov_b32 s6, -1 1622; GFX9-NEXT: s_mov_b32 s10, s6 1623; GFX9-NEXT: s_mov_b32 s11, s7 1624; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1625; GFX9-NEXT: s_mov_b32 s8, s2 1626; GFX9-NEXT: s_mov_b32 s9, s3 1627; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1628; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1629; GFX9-NEXT: s_mov_b32 s4, s0 1630; GFX9-NEXT: s_mov_b32 s5, s1 1631; GFX9-NEXT: s_waitcnt vmcnt(0) 1632; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 1633; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1634; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 1635; GFX9-NEXT: s_endpgm 1636; 1637; GFX10-LABEL: v_mul_i1: 1638; GFX10: ; %bb.0: ; %entry 1639; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1640; GFX10-NEXT: s_mov_b32 s6, -1 1641; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1642; GFX10-NEXT: s_mov_b32 s10, s6 1643; GFX10-NEXT: s_mov_b32 s11, s7 1644; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1645; GFX10-NEXT: s_mov_b32 s8, s2 1646; GFX10-NEXT: s_mov_b32 s9, s3 1647; GFX10-NEXT: s_clause 0x1 1648; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1649; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1650; GFX10-NEXT: s_mov_b32 s4, s0 1651; GFX10-NEXT: s_mov_b32 s5, s1 1652; GFX10-NEXT: s_waitcnt vmcnt(0) 1653; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 1654; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 1655; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 1656; GFX10-NEXT: s_endpgm 1657; 1658; GFX11-LABEL: v_mul_i1: 1659; GFX11: ; %bb.0: ; %entry 1660; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1661; GFX11-NEXT: s_mov_b32 s6, -1 1662; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1663; GFX11-NEXT: s_mov_b32 s10, s6 1664; GFX11-NEXT: s_mov_b32 s11, s7 1665; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1666; GFX11-NEXT: s_mov_b32 s8, s2 1667; GFX11-NEXT: s_mov_b32 s9, s3 
1668; GFX11-NEXT: s_clause 0x1 1669; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0 1670; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4 1671; GFX11-NEXT: s_mov_b32 s4, s0 1672; GFX11-NEXT: s_mov_b32 s5, s1 1673; GFX11-NEXT: s_waitcnt vmcnt(0) 1674; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 1675; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1676; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 1677; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0 1678; GFX11-NEXT: s_endpgm 1679; 1680; GFX12-LABEL: v_mul_i1: 1681; GFX12: ; %bb.0: ; %entry 1682; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1683; GFX12-NEXT: s_mov_b32 s6, -1 1684; GFX12-NEXT: s_mov_b32 s7, 0x31016000 1685; GFX12-NEXT: s_mov_b32 s10, s6 1686; GFX12-NEXT: s_mov_b32 s11, s7 1687; GFX12-NEXT: s_wait_kmcnt 0x0 1688; GFX12-NEXT: s_mov_b32 s8, s2 1689; GFX12-NEXT: s_mov_b32 s9, s3 1690; GFX12-NEXT: s_clause 0x1 1691; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null 1692; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 1693; GFX12-NEXT: s_mov_b32 s4, s0 1694; GFX12-NEXT: s_mov_b32 s5, s1 1695; GFX12-NEXT: s_wait_loadcnt 0x0 1696; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 1697; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1698; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 1699; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null 1700; GFX12-NEXT: s_endpgm 1701; 1702; EG-LABEL: v_mul_i1: 1703; EG: ; %bb.0: ; %entry 1704; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1705; EG-NEXT: TEX 1 @6 1706; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 1707; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1708; EG-NEXT: CF_END 1709; EG-NEXT: PAD 1710; EG-NEXT: Fetch clause starting at 6: 1711; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1 1712; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1713; EG-NEXT: ALU clause starting at 10: 1714; EG-NEXT: MOV * T0.X, KC0[2].Z, 1715; EG-NEXT: ALU clause starting at 11: 1716; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 1717; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 1718; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1719; EG-NEXT: AND_INT 
T1.W, PS, 1, 1720; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1721; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1722; EG-NEXT: LSHL T0.X, PV.W, PS, 1723; EG-NEXT: LSHL * T0.W, literal.x, PS, 1724; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1725; EG-NEXT: MOV T0.Y, 0.0, 1726; EG-NEXT: MOV * T0.Z, 0.0, 1727; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1728; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1729entry: 1730 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1731 %a = load i1, ptr addrspace(1) %in 1732 %b = load i1, ptr addrspace(1) %b_ptr 1733 %result = mul i1 %a, %b 1734 store i1 %result, ptr addrspace(1) %out 1735 ret void 1736} 1737 1738; A standard 64-bit multiply. The expansion should be around 6 instructions. 1739; The exact instruction sequence differs per target, so it is not matched by 1740; hand: the checks below are autogenerated (see the NOTE at the top of this 1741; file). If a correct optimization changes the expansion, regenerate them with 1742; utils/update_llc_test_checks.py rather than treating the difference as a 1743; regression. 
1744 1745define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { 1746; SI-LABEL: s_mul_i64: 1747; SI: ; %bb.0: ; %entry 1748; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1749; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1750; SI-NEXT: s_mov_b32 s7, 0xf000 1751; SI-NEXT: s_mov_b32 s6, -1 1752; SI-NEXT: s_waitcnt lgkmcnt(0) 1753; SI-NEXT: s_mov_b32 s4, s0 1754; SI-NEXT: v_mov_b32_e32 v0, s8 1755; SI-NEXT: v_mul_hi_u32 v0, s2, v0 1756; SI-NEXT: s_mul_i32 s0, s2, s9 1757; SI-NEXT: s_mov_b32 s5, s1 1758; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 1759; SI-NEXT: s_mul_i32 s0, s3, s8 1760; SI-NEXT: v_add_i32_e32 v1, vcc, s0, v0 1761; SI-NEXT: s_mul_i32 s0, s2, s8 1762; SI-NEXT: v_mov_b32_e32 v0, s0 1763; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1764; SI-NEXT: s_endpgm 1765; 1766; VI-LABEL: s_mul_i64: 1767; VI: ; %bb.0: ; %entry 1768; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1769; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1770; VI-NEXT: s_mov_b32 s7, 0xf000 1771; VI-NEXT: s_mov_b32 s6, -1 1772; VI-NEXT: s_waitcnt lgkmcnt(0) 1773; VI-NEXT: s_mov_b32 s4, s0 1774; VI-NEXT: v_mov_b32_e32 v0, s8 1775; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s2, v0, 0 1776; VI-NEXT: s_mul_i32 s0, s2, s9 1777; VI-NEXT: s_mov_b32 s5, s1 1778; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 1779; VI-NEXT: s_mul_i32 s0, s3, s8 1780; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 1781; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1782; VI-NEXT: s_endpgm 1783; 1784; GFX9-LABEL: s_mul_i64: 1785; GFX9: ; %bb.0: ; %entry 1786; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1787; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1788; GFX9-NEXT: s_mov_b32 s7, 0xf000 1789; GFX9-NEXT: s_mov_b32 s6, -1 1790; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1791; GFX9-NEXT: s_mov_b32 s4, s0 1792; GFX9-NEXT: s_mov_b32 s5, s1 1793; GFX9-NEXT: s_mul_i32 s0, s2, s9 1794; GFX9-NEXT: s_mul_hi_u32 s1, s2, s8 1795; GFX9-NEXT: s_add_i32 s0, s1, s0 1796; GFX9-NEXT: s_mul_i32 s1, s3, s8 1797; 
GFX9-NEXT: s_add_i32 s0, s0, s1 1798; GFX9-NEXT: s_mul_i32 s1, s2, s8 1799; GFX9-NEXT: v_mov_b32_e32 v0, s1 1800; GFX9-NEXT: v_mov_b32_e32 v1, s0 1801; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1802; GFX9-NEXT: s_endpgm 1803; 1804; GFX10-LABEL: s_mul_i64: 1805; GFX10: ; %bb.0: ; %entry 1806; GFX10-NEXT: s_clause 0x1 1807; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1808; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1809; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1810; GFX10-NEXT: s_mul_i32 s4, s2, s7 1811; GFX10-NEXT: s_mul_hi_u32 s5, s2, s6 1812; GFX10-NEXT: s_mul_i32 s3, s3, s6 1813; GFX10-NEXT: s_add_i32 s4, s5, s4 1814; GFX10-NEXT: s_mul_i32 s2, s2, s6 1815; GFX10-NEXT: s_add_i32 s4, s4, s3 1816; GFX10-NEXT: v_mov_b32_e32 v0, s2 1817; GFX10-NEXT: v_mov_b32_e32 v1, s4 1818; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1819; GFX10-NEXT: s_mov_b32 s6, -1 1820; GFX10-NEXT: s_mov_b32 s4, s0 1821; GFX10-NEXT: s_mov_b32 s5, s1 1822; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1823; GFX10-NEXT: s_endpgm 1824; 1825; GFX11-LABEL: s_mul_i64: 1826; GFX11: ; %bb.0: ; %entry 1827; GFX11-NEXT: s_clause 0x1 1828; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1829; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1830; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1831; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1832; GFX11-NEXT: s_mul_i32 s5, s2, s5 1833; GFX11-NEXT: s_mul_hi_u32 s6, s2, s4 1834; GFX11-NEXT: s_mul_i32 s3, s3, s4 1835; GFX11-NEXT: s_add_i32 s5, s6, s5 1836; GFX11-NEXT: s_mul_i32 s2, s2, s4 1837; GFX11-NEXT: s_add_i32 s5, s5, s3 1838; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1839; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5 1840; GFX11-NEXT: s_mov_b32 s6, -1 1841; GFX11-NEXT: s_mov_b32 s4, s0 1842; GFX11-NEXT: s_mov_b32 s5, s1 1843; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 1844; GFX11-NEXT: s_endpgm 1845; 1846; GFX12-LABEL: s_mul_i64: 1847; GFX12: ; %bb.0: ; %entry 1848; GFX12-NEXT: s_clause 0x1 1849; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 
0x24 1850; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1851; GFX12-NEXT: s_wait_kmcnt 0x0 1852; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] 1853; GFX12-NEXT: s_mov_b32 s3, 0x31016000 1854; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 1855; GFX12-NEXT: s_mov_b32 s2, -1 1856; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 1857; GFX12-NEXT: s_endpgm 1858; 1859; EG-LABEL: s_mul_i64: 1860; EG: ; %bb.0: ; %entry 1861; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1862; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1863; EG-NEXT: CF_END 1864; EG-NEXT: PAD 1865; EG-NEXT: ALU clause starting at 4: 1866; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y, 1867; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z, 1868; EG-NEXT: ADD_INT T0.W, T0.X, PS, 1869; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y, 1870; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 1871; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1872; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1873; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y, 1874entry: 1875 %mul = mul i64 %a, %b 1876 store i64 %mul, ptr addrspace(1) %out, align 8 1877 ret void 1878} 1879 1880define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { 1881; SI-LABEL: v_mul_i64: 1882; SI: ; %bb.0: ; %entry 1883; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1884; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1885; SI-NEXT: s_mov_b32 s7, 0xf000 1886; SI-NEXT: s_mov_b32 s6, -1 1887; SI-NEXT: s_mov_b32 s10, s6 1888; SI-NEXT: s_mov_b32 s11, s7 1889; SI-NEXT: s_waitcnt lgkmcnt(0) 1890; SI-NEXT: s_mov_b32 s12, s2 1891; SI-NEXT: s_mov_b32 s13, s3 1892; SI-NEXT: s_mov_b32 s14, s6 1893; SI-NEXT: s_mov_b32 s15, s7 1894; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1895; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1896; SI-NEXT: s_mov_b32 s4, s0 1897; SI-NEXT: s_mov_b32 s5, s1 1898; SI-NEXT: s_waitcnt vmcnt(0) 1899; SI-NEXT: v_mul_lo_u32 v1, v2, v1 1900; SI-NEXT: v_mul_hi_u32 v4, v2, v0 1901; 
SI-NEXT: v_mul_lo_u32 v3, v3, v0 1902; SI-NEXT: v_mul_lo_u32 v0, v2, v0 1903; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1904; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1905; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1906; SI-NEXT: s_endpgm 1907; 1908; VI-LABEL: v_mul_i64: 1909; VI: ; %bb.0: ; %entry 1910; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1911; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1912; VI-NEXT: s_mov_b32 s7, 0xf000 1913; VI-NEXT: s_mov_b32 s6, -1 1914; VI-NEXT: s_mov_b32 s10, s6 1915; VI-NEXT: s_mov_b32 s11, s7 1916; VI-NEXT: s_waitcnt lgkmcnt(0) 1917; VI-NEXT: s_mov_b32 s12, s2 1918; VI-NEXT: s_mov_b32 s13, s3 1919; VI-NEXT: s_mov_b32 s14, s6 1920; VI-NEXT: s_mov_b32 s15, s7 1921; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1922; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1923; VI-NEXT: s_mov_b32 s4, s0 1924; VI-NEXT: s_mov_b32 s5, s1 1925; VI-NEXT: s_waitcnt vmcnt(0) 1926; VI-NEXT: v_mul_lo_u32 v4, v2, v1 1927; VI-NEXT: v_mad_u64_u32 v[1:2], s[2:3], v2, v0, 0 1928; VI-NEXT: v_mul_lo_u32 v0, v3, v0 1929; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 1930; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 1931; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 1932; VI-NEXT: s_endpgm 1933; 1934; GFX9-LABEL: v_mul_i64: 1935; GFX9: ; %bb.0: ; %entry 1936; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1937; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1938; GFX9-NEXT: s_mov_b32 s7, 0xf000 1939; GFX9-NEXT: s_mov_b32 s6, -1 1940; GFX9-NEXT: s_mov_b32 s10, s6 1941; GFX9-NEXT: s_mov_b32 s11, s7 1942; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1943; GFX9-NEXT: s_mov_b32 s12, s2 1944; GFX9-NEXT: s_mov_b32 s13, s3 1945; GFX9-NEXT: s_mov_b32 s14, s6 1946; GFX9-NEXT: s_mov_b32 s15, s7 1947; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1948; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1949; GFX9-NEXT: s_mov_b32 s4, s0 1950; GFX9-NEXT: s_mov_b32 s5, s1 1951; GFX9-NEXT: s_waitcnt vmcnt(0) 1952; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 1953; 
GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 1954; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 1955; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 1956; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 1957; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1958; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1959; GFX9-NEXT: s_endpgm 1960; 1961; GFX10-LABEL: v_mul_i64: 1962; GFX10: ; %bb.0: ; %entry 1963; GFX10-NEXT: s_clause 0x1 1964; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1965; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 1966; GFX10-NEXT: s_mov_b32 s6, -1 1967; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1968; GFX10-NEXT: s_mov_b32 s10, s6 1969; GFX10-NEXT: s_mov_b32 s11, s7 1970; GFX10-NEXT: s_mov_b32 s14, s6 1971; GFX10-NEXT: s_mov_b32 s15, s7 1972; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1973; GFX10-NEXT: s_mov_b32 s12, s2 1974; GFX10-NEXT: s_mov_b32 s13, s3 1975; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1976; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1977; GFX10-NEXT: s_mov_b32 s4, s0 1978; GFX10-NEXT: s_mov_b32 s5, s1 1979; GFX10-NEXT: s_waitcnt vmcnt(0) 1980; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 1981; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 1982; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 1983; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 1984; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 1985; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 1986; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1987; GFX10-NEXT: s_endpgm 1988; 1989; GFX11-LABEL: v_mul_i64: 1990; GFX11: ; %bb.0: ; %entry 1991; GFX11-NEXT: s_clause 0x1 1992; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1993; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1994; GFX11-NEXT: s_mov_b32 s10, -1 1995; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1996; GFX11-NEXT: s_mov_b32 s6, s10 1997; GFX11-NEXT: s_mov_b32 s7, s11 1998; GFX11-NEXT: s_mov_b32 s14, s10 1999; GFX11-NEXT: s_mov_b32 s15, s11 2000; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2001; GFX11-NEXT: s_mov_b32 s12, s2 2002; GFX11-NEXT: s_mov_b32 s13, s3 2003; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 
0 2004; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 2005; GFX11-NEXT: s_mov_b32 s8, s0 2006; GFX11-NEXT: s_mov_b32 s9, s1 2007; GFX11-NEXT: s_waitcnt vmcnt(0) 2008; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 2009; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 2010; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0 2011; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0 2012; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2013; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 2014; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 2015; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 2016; GFX11-NEXT: s_endpgm 2017; 2018; GFX12-LABEL: v_mul_i64: 2019; GFX12: ; %bb.0: ; %entry 2020; GFX12-NEXT: s_clause 0x1 2021; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2022; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2023; GFX12-NEXT: s_mov_b32 s10, -1 2024; GFX12-NEXT: s_mov_b32 s11, 0x31016000 2025; GFX12-NEXT: s_mov_b32 s6, s10 2026; GFX12-NEXT: s_mov_b32 s7, s11 2027; GFX12-NEXT: s_mov_b32 s14, s10 2028; GFX12-NEXT: s_mov_b32 s15, s11 2029; GFX12-NEXT: s_wait_kmcnt 0x0 2030; GFX12-NEXT: s_mov_b32 s12, s2 2031; GFX12-NEXT: s_mov_b32 s13, s3 2032; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null 2033; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null 2034; GFX12-NEXT: s_mov_b32 s8, s0 2035; GFX12-NEXT: s_mov_b32 s9, s1 2036; GFX12-NEXT: s_wait_loadcnt 0x0 2037; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3 2038; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2 2039; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 2040; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 2041; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2042; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1 2043; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4 2044; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null 2045; GFX12-NEXT: s_endpgm 2046; 2047; EG-LABEL: v_mul_i64: 2048; EG: ; %bb.0: ; %entry 2049; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 2050; EG-NEXT: TEX 1 @6 2051; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[] 2052; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1 2053; EG-NEXT: CF_END 2054; EG-NEXT: PAD 2055; EG-NEXT: Fetch clause starting at 6: 2056; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 2057; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 2058; EG-NEXT: ALU clause starting at 10: 2059; EG-NEXT: MOV T0.X, KC0[2].Z, 2060; EG-NEXT: MOV * T1.X, KC0[2].W, 2061; EG-NEXT: ALU clause starting at 12: 2062; EG-NEXT: MULHI * T0.Z, T0.X, T1.X, 2063; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y, 2064; EG-NEXT: ADD_INT T0.W, T0.Z, PS, 2065; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X, 2066; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 2067; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 2068; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 2069; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2070entry: 2071 %a = load i64, ptr addrspace(1) %aptr, align 8 2072 %b = load i64, ptr addrspace(1) %bptr, align 8 2073 %mul = mul i64 %a, %b 2074 store i64 %mul, ptr addrspace(1) %out, align 8 2075 ret void 2076} 2077 2078define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { 2079; SI-LABEL: mul32_in_branch: 2080; SI: ; %bb.0: ; %entry 2081; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2082; SI-NEXT: s_waitcnt lgkmcnt(0) 2083; SI-NEXT: s_cmp_lg_u32 s0, 0 2084; SI-NEXT: s_cbranch_scc0 .LBB15_2 2085; SI-NEXT: ; %bb.1: ; %else 2086; SI-NEXT: s_mul_i32 s8, s0, s1 2087; SI-NEXT: s_mov_b64 s[6:7], 0 2088; SI-NEXT: s_branch .LBB15_3 2089; SI-NEXT: .LBB15_2: 2090; SI-NEXT: s_mov_b64 s[6:7], -1 2091; SI-NEXT: ; implicit-def: $sgpr8 2092; SI-NEXT: .LBB15_3: ; %Flow 2093; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2094; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7] 2095; SI-NEXT: s_waitcnt lgkmcnt(0) 2096; SI-NEXT: s_mov_b64 vcc, vcc 2097; SI-NEXT: s_cbranch_vccnz .LBB15_5 2098; SI-NEXT: ; %bb.4: ; %if 2099; SI-NEXT: s_mov_b32 s7, 0xf000 2100; SI-NEXT: s_mov_b32 s6, -1 2101; SI-NEXT: s_mov_b32 s4, s2 2102; SI-NEXT: s_mov_b32 s5, s3 2103; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 2104; SI-NEXT: s_branch 
.LBB15_6 2105; SI-NEXT: .LBB15_5: 2106; SI-NEXT: v_mov_b32_e32 v0, s8 2107; SI-NEXT: .LBB15_6: ; %endif 2108; SI-NEXT: s_mov_b32 s3, 0xf000 2109; SI-NEXT: s_mov_b32 s2, -1 2110; SI-NEXT: s_waitcnt vmcnt(0) 2111; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2112; SI-NEXT: s_endpgm 2113; 2114; VI-LABEL: mul32_in_branch: 2115; VI: ; %bb.0: ; %entry 2116; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2117; VI-NEXT: s_waitcnt lgkmcnt(0) 2118; VI-NEXT: s_cmp_lg_u32 s0, 0 2119; VI-NEXT: s_cbranch_scc0 .LBB15_2 2120; VI-NEXT: ; %bb.1: ; %else 2121; VI-NEXT: s_mul_i32 s8, s0, s1 2122; VI-NEXT: s_mov_b64 s[6:7], 0 2123; VI-NEXT: s_branch .LBB15_3 2124; VI-NEXT: .LBB15_2: 2125; VI-NEXT: s_mov_b64 s[6:7], -1 2126; VI-NEXT: ; implicit-def: $sgpr8 2127; VI-NEXT: .LBB15_3: ; %Flow 2128; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2129; VI-NEXT: s_andn2_b64 vcc, exec, s[6:7] 2130; VI-NEXT: s_cbranch_vccnz .LBB15_5 2131; VI-NEXT: ; %bb.4: ; %if 2132; VI-NEXT: s_mov_b32 s7, 0xf000 2133; VI-NEXT: s_mov_b32 s6, -1 2134; VI-NEXT: s_waitcnt lgkmcnt(0) 2135; VI-NEXT: s_mov_b32 s4, s2 2136; VI-NEXT: s_mov_b32 s5, s3 2137; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 2138; VI-NEXT: s_branch .LBB15_6 2139; VI-NEXT: .LBB15_5: 2140; VI-NEXT: v_mov_b32_e32 v0, s8 2141; VI-NEXT: .LBB15_6: ; %endif 2142; VI-NEXT: s_waitcnt lgkmcnt(0) 2143; VI-NEXT: s_mov_b32 s3, 0xf000 2144; VI-NEXT: s_mov_b32 s2, -1 2145; VI-NEXT: s_waitcnt vmcnt(0) 2146; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2147; VI-NEXT: s_endpgm 2148; 2149; GFX9-LABEL: mul32_in_branch: 2150; GFX9: ; %bb.0: ; %entry 2151; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2152; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2153; GFX9-NEXT: s_cmp_lg_u32 s0, 0 2154; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 2155; GFX9-NEXT: ; %bb.1: ; %else 2156; GFX9-NEXT: s_mul_i32 s8, s0, s1 2157; GFX9-NEXT: s_mov_b64 s[6:7], 0 2158; GFX9-NEXT: s_branch .LBB15_3 2159; GFX9-NEXT: .LBB15_2: 2160; GFX9-NEXT: s_mov_b64 s[6:7], -1 2161; GFX9-NEXT: ; implicit-def: $sgpr8 
2162; GFX9-NEXT: .LBB15_3: ; %Flow 2163; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2164; GFX9-NEXT: s_andn2_b64 vcc, exec, s[6:7] 2165; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 2166; GFX9-NEXT: ; %bb.4: ; %if 2167; GFX9-NEXT: s_mov_b32 s7, 0xf000 2168; GFX9-NEXT: s_mov_b32 s6, -1 2169; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2170; GFX9-NEXT: s_mov_b32 s4, s2 2171; GFX9-NEXT: s_mov_b32 s5, s3 2172; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 2173; GFX9-NEXT: s_branch .LBB15_6 2174; GFX9-NEXT: .LBB15_5: 2175; GFX9-NEXT: v_mov_b32_e32 v0, s8 2176; GFX9-NEXT: .LBB15_6: ; %endif 2177; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2178; GFX9-NEXT: s_mov_b32 s3, 0xf000 2179; GFX9-NEXT: s_mov_b32 s2, -1 2180; GFX9-NEXT: s_waitcnt vmcnt(0) 2181; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 2182; GFX9-NEXT: s_endpgm 2183; 2184; GFX10-LABEL: mul32_in_branch: 2185; GFX10: ; %bb.0: ; %entry 2186; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2187; GFX10-NEXT: s_mov_b32 s6, 0 2188; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2189; GFX10-NEXT: s_cmp_lg_u32 s0, 0 2190; GFX10-NEXT: s_cbranch_scc0 .LBB15_2 2191; GFX10-NEXT: ; %bb.1: ; %else 2192; GFX10-NEXT: s_mul_i32 s7, s0, s1 2193; GFX10-NEXT: s_branch .LBB15_3 2194; GFX10-NEXT: .LBB15_2: 2195; GFX10-NEXT: s_mov_b32 s6, -1 2196; GFX10-NEXT: ; implicit-def: $sgpr7 2197; GFX10-NEXT: .LBB15_3: ; %Flow 2198; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2199; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 2200; GFX10-NEXT: s_cbranch_vccnz .LBB15_5 2201; GFX10-NEXT: ; %bb.4: ; %if 2202; GFX10-NEXT: s_mov_b32 s7, 0x31016000 2203; GFX10-NEXT: s_mov_b32 s6, -1 2204; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2205; GFX10-NEXT: s_mov_b32 s4, s2 2206; GFX10-NEXT: s_mov_b32 s5, s3 2207; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 2208; GFX10-NEXT: s_branch .LBB15_6 2209; GFX10-NEXT: .LBB15_5: 2210; GFX10-NEXT: v_mov_b32_e32 v0, s7 2211; GFX10-NEXT: .LBB15_6: ; %endif 2212; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2214; GFX10-NEXT: 
s_mov_b32 s2, -1 2215; GFX10-NEXT: s_waitcnt vmcnt(0) 2216; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 2217; GFX10-NEXT: s_endpgm 2218; 2219; GFX11-LABEL: mul32_in_branch: 2220; GFX11: ; %bb.0: ; %entry 2221; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 2222; GFX11-NEXT: s_mov_b32 s6, 0 2223; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX11-NEXT: s_cmp_lg_u32 s0, 0 2225; GFX11-NEXT: s_cbranch_scc0 .LBB15_2 2226; GFX11-NEXT: ; %bb.1: ; %else 2227; GFX11-NEXT: s_mul_i32 s7, s0, s1 2228; GFX11-NEXT: s_branch .LBB15_3 2229; GFX11-NEXT: .LBB15_2: 2230; GFX11-NEXT: s_mov_b32 s6, -1 2231; GFX11-NEXT: ; implicit-def: $sgpr7 2232; GFX11-NEXT: .LBB15_3: ; %Flow 2233; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2234; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 2235; GFX11-NEXT: s_cbranch_vccnz .LBB15_5 2236; GFX11-NEXT: ; %bb.4: ; %if 2237; GFX11-NEXT: s_mov_b32 s7, 0x31016000 2238; GFX11-NEXT: s_mov_b32 s6, -1 2239; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2240; GFX11-NEXT: s_mov_b32 s4, s2 2241; GFX11-NEXT: s_mov_b32 s5, s3 2242; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 2243; GFX11-NEXT: s_branch .LBB15_6 2244; GFX11-NEXT: .LBB15_5: 2245; GFX11-NEXT: v_mov_b32_e32 v0, s7 2246; GFX11-NEXT: .LBB15_6: ; %endif 2247; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2248; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2249; GFX11-NEXT: s_mov_b32 s2, -1 2250; GFX11-NEXT: s_waitcnt vmcnt(0) 2251; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 2252; GFX11-NEXT: s_endpgm 2253; 2254; GFX12-LABEL: mul32_in_branch: 2255; GFX12: ; %bb.0: ; %entry 2256; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 2257; GFX12-NEXT: s_mov_b32 s6, 0 2258; GFX12-NEXT: s_wait_kmcnt 0x0 2259; GFX12-NEXT: s_cmp_lg_u32 s0, 0 2260; GFX12-NEXT: s_cbranch_scc0 .LBB15_2 2261; GFX12-NEXT: ; %bb.1: ; %else 2262; GFX12-NEXT: s_mul_i32 s7, s0, s1 2263; GFX12-NEXT: s_branch .LBB15_3 2264; GFX12-NEXT: .LBB15_2: 2265; GFX12-NEXT: s_mov_b32 s6, -1 2266; GFX12-NEXT: ; implicit-def: $sgpr7 2267; GFX12-NEXT: .LBB15_3: ; %Flow 2268; GFX12-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 2269; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 2270; GFX12-NEXT: s_cbranch_vccnz .LBB15_5 2271; GFX12-NEXT: ; %bb.4: ; %if 2272; GFX12-NEXT: s_mov_b32 s7, 0x31016000 2273; GFX12-NEXT: s_mov_b32 s6, -1 2274; GFX12-NEXT: s_wait_kmcnt 0x0 2275; GFX12-NEXT: s_mov_b32 s4, s2 2276; GFX12-NEXT: s_mov_b32 s5, s3 2277; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null 2278; GFX12-NEXT: s_branch .LBB15_6 2279; GFX12-NEXT: .LBB15_5: 2280; GFX12-NEXT: v_mov_b32_e32 v0, s7 2281; GFX12-NEXT: .LBB15_6: ; %endif 2282; GFX12-NEXT: s_wait_kmcnt 0x0 2283; GFX12-NEXT: s_mov_b32 s3, 0x31016000 2284; GFX12-NEXT: s_mov_b32 s2, -1 2285; GFX12-NEXT: s_wait_loadcnt 0x0 2286; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null 2287; GFX12-NEXT: s_endpgm 2288; 2289; EG-LABEL: mul32_in_branch: 2290; EG: ; %bb.0: ; %entry 2291; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] 2292; EG-NEXT: JUMP @3 POP:1 2293; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[] 2294; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[] 2295; EG-NEXT: JUMP @8 POP:1 2296; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] 2297; EG-NEXT: TEX 0 @12 2298; EG-NEXT: POP @8 POP:1 2299; EG-NEXT: ALU 1, @27, KC0[], KC1[] 2300; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2301; EG-NEXT: CF_END 2302; EG-NEXT: PAD 2303; EG-NEXT: Fetch clause starting at 12: 2304; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 2305; EG-NEXT: ALU clause starting at 14: 2306; EG-NEXT: MOV T0.W, literal.x, 2307; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0, 2308; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 2309; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 2310; EG-NEXT: ALU clause starting at 18: 2311; EG-NEXT: MOV T1.W, KC0[2].W, 2312; EG-NEXT: MOV * T2.W, KC0[3].X, 2313; EG-NEXT: MOV T0.W, literal.x, 2314; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 2315; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2316; EG-NEXT: ALU clause starting at 23: 2317; EG-NEXT: MOV T1.W, KC0[2].Y, 2318; EG-NEXT: SETE_INT * 
T0.W, T0.W, 0.0, 2319; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 2320; EG-NEXT: ALU clause starting at 26: 2321; EG-NEXT: MOV * T0.X, KC0[2].Z, 2322; EG-NEXT: ALU clause starting at 27: 2323; EG-NEXT: LSHR * T1.X, T1.W, literal.x, 2324; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2325entry: 2326 %0 = icmp eq i32 %a, 0 2327 br i1 %0, label %if, label %else 2328 2329if: 2330 %1 = load i32, ptr addrspace(1) %in 2331 br label %endif 2332 2333else: 2334 %2 = mul i32 %a, %b 2335 br label %endif 2336 2337endif: 2338 %3 = phi i32 [%1, %if], [%2, %else] 2339 store i32 %3, ptr addrspace(1) %out 2340 ret void 2341} 2342 2343define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { 2344; SI-LABEL: mul64_in_branch: 2345; SI: ; %bb.0: ; %entry 2346; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 2347; SI-NEXT: s_mov_b64 s[8:9], 0 2348; SI-NEXT: s_waitcnt lgkmcnt(0) 2349; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 2350; SI-NEXT: s_and_b64 vcc, exec, s[10:11] 2351; SI-NEXT: s_cbranch_vccz .LBB16_4 2352; SI-NEXT: ; %bb.1: ; %else 2353; SI-NEXT: v_mov_b32_e32 v0, s6 2354; SI-NEXT: v_mul_hi_u32 v0, s4, v0 2355; SI-NEXT: s_mul_i32 s7, s4, s7 2356; SI-NEXT: s_mul_i32 s5, s5, s6 2357; SI-NEXT: s_mul_i32 s4, s4, s6 2358; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 2359; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 2360; SI-NEXT: v_mov_b32_e32 v0, s4 2361; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 2362; SI-NEXT: s_cbranch_vccnz .LBB16_3 2363; SI-NEXT: .LBB16_2: ; %if 2364; SI-NEXT: s_mov_b32 s7, 0xf000 2365; SI-NEXT: s_mov_b32 s6, -1 2366; SI-NEXT: s_mov_b32 s4, s2 2367; SI-NEXT: s_mov_b32 s5, s3 2368; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2369; SI-NEXT: .LBB16_3: ; %endif 2370; SI-NEXT: s_mov_b32 s3, 0xf000 2371; SI-NEXT: s_mov_b32 s2, -1 2372; SI-NEXT: s_waitcnt vmcnt(0) 2373; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2374; SI-NEXT: s_endpgm 2375; SI-NEXT: .LBB16_4: 2376; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1 2377; SI-NEXT: s_branch .LBB16_2 2378; 2379; VI-LABEL: mul64_in_branch: 2380; VI: ; %bb.0: ; %entry 2381; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 2382; VI-NEXT: s_mov_b64 s[8:9], 0 2383; VI-NEXT: s_waitcnt lgkmcnt(0) 2384; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 2385; VI-NEXT: s_cbranch_scc0 .LBB16_4 2386; VI-NEXT: ; %bb.1: ; %else 2387; VI-NEXT: v_mov_b32_e32 v0, s6 2388; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 2389; VI-NEXT: s_mul_i32 s4, s4, s7 2390; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 2391; VI-NEXT: s_mul_i32 s4, s5, s6 2392; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 2393; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 2394; VI-NEXT: s_cbranch_vccnz .LBB16_3 2395; VI-NEXT: .LBB16_2: ; %if 2396; VI-NEXT: s_mov_b32 s7, 0xf000 2397; VI-NEXT: s_mov_b32 s6, -1 2398; VI-NEXT: s_mov_b32 s4, s2 2399; VI-NEXT: s_mov_b32 s5, s3 2400; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 2401; VI-NEXT: .LBB16_3: ; %endif 2402; VI-NEXT: s_mov_b32 s3, 0xf000 2403; VI-NEXT: s_mov_b32 s2, -1 2404; VI-NEXT: s_waitcnt vmcnt(0) 2405; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2406; VI-NEXT: s_endpgm 2407; VI-NEXT: .LBB16_4: 2408; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 2409; VI-NEXT: s_branch .LBB16_2 2410; 2411; GFX9-LABEL: mul64_in_branch: 2412; GFX9: ; %bb.0: ; %entry 2413; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2414; GFX9-NEXT: s_mov_b64 s[0:1], 0 2415; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2416; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 2417; GFX9-NEXT: s_cbranch_scc0 .LBB16_3 2418; GFX9-NEXT: ; %bb.1: ; %else 2419; GFX9-NEXT: s_mul_i32 s2, s12, s15 2420; GFX9-NEXT: s_mul_hi_u32 s3, s12, s14 2421; GFX9-NEXT: s_add_i32 s2, s3, s2 2422; GFX9-NEXT: s_mul_i32 s3, s13, s14 2423; GFX9-NEXT: s_add_i32 s3, s2, s3 2424; GFX9-NEXT: s_mul_i32 s2, s12, s14 2425; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] 2426; GFX9-NEXT: s_cbranch_vccnz .LBB16_4 2427; GFX9-NEXT: .LBB16_2: ; %if 2428; GFX9-NEXT: s_mov_b32 s3, 0xf000 2429; GFX9-NEXT: s_mov_b32 s2, -1 2430; GFX9-NEXT: 
s_mov_b32 s0, s10 2431; GFX9-NEXT: s_mov_b32 s1, s11 2432; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 2433; GFX9-NEXT: s_branch .LBB16_5 2434; GFX9-NEXT: .LBB16_3: 2435; GFX9-NEXT: ; implicit-def: $sgpr2_sgpr3 2436; GFX9-NEXT: s_branch .LBB16_2 2437; GFX9-NEXT: .LBB16_4: 2438; GFX9-NEXT: v_mov_b32_e32 v0, s2 2439; GFX9-NEXT: v_mov_b32_e32 v1, s3 2440; GFX9-NEXT: .LBB16_5: ; %endif 2441; GFX9-NEXT: s_mov_b32 s11, 0xf000 2442; GFX9-NEXT: s_mov_b32 s10, -1 2443; GFX9-NEXT: s_waitcnt vmcnt(0) 2444; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 2445; GFX9-NEXT: s_endpgm 2446; 2447; GFX10-LABEL: mul64_in_branch: 2448; GFX10: ; %bb.0: ; %entry 2449; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2450; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2451; GFX10-NEXT: s_cmp_lg_u64 s[12:13], 0 2452; GFX10-NEXT: s_cbranch_scc0 .LBB16_3 2453; GFX10-NEXT: ; %bb.1: ; %else 2454; GFX10-NEXT: s_mul_i32 s0, s12, s15 2455; GFX10-NEXT: s_mul_hi_u32 s1, s12, s14 2456; GFX10-NEXT: s_mul_i32 s2, s13, s14 2457; GFX10-NEXT: s_add_i32 s0, s1, s0 2458; GFX10-NEXT: s_add_i32 s1, s0, s2 2459; GFX10-NEXT: s_mul_i32 s0, s12, s14 2460; GFX10-NEXT: s_cbranch_execnz .LBB16_4 2461; GFX10-NEXT: .LBB16_2: ; %if 2462; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2463; GFX10-NEXT: s_mov_b32 s2, -1 2464; GFX10-NEXT: s_mov_b32 s0, s10 2465; GFX10-NEXT: s_mov_b32 s1, s11 2466; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 2467; GFX10-NEXT: s_branch .LBB16_5 2468; GFX10-NEXT: .LBB16_3: 2469; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1 2470; GFX10-NEXT: s_branch .LBB16_2 2471; GFX10-NEXT: .LBB16_4: 2472; GFX10-NEXT: v_mov_b32_e32 v0, s0 2473; GFX10-NEXT: v_mov_b32_e32 v1, s1 2474; GFX10-NEXT: .LBB16_5: ; %endif 2475; GFX10-NEXT: s_mov_b32 s11, 0x31016000 2476; GFX10-NEXT: s_mov_b32 s10, -1 2477; GFX10-NEXT: s_waitcnt vmcnt(0) 2478; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 2479; GFX10-NEXT: s_endpgm 2480; 2481; GFX11-LABEL: mul64_in_branch: 2482; GFX11: ; %bb.0: ; %entry 2483; 
GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 2484; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2485; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 2486; GFX11-NEXT: s_cbranch_scc0 .LBB16_3 2487; GFX11-NEXT: ; %bb.1: ; %else 2488; GFX11-NEXT: s_mul_i32 s7, s4, s7 2489; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 2490; GFX11-NEXT: s_mul_i32 s5, s5, s6 2491; GFX11-NEXT: s_add_i32 s7, s8, s7 2492; GFX11-NEXT: s_mul_i32 s4, s4, s6 2493; GFX11-NEXT: s_add_i32 s5, s7, s5 2494; GFX11-NEXT: s_cbranch_execnz .LBB16_4 2495; GFX11-NEXT: .LBB16_2: ; %if 2496; GFX11-NEXT: s_mov_b32 s7, 0x31016000 2497; GFX11-NEXT: s_mov_b32 s6, -1 2498; GFX11-NEXT: s_mov_b32 s4, s2 2499; GFX11-NEXT: s_mov_b32 s5, s3 2500; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 2501; GFX11-NEXT: s_branch .LBB16_5 2502; GFX11-NEXT: .LBB16_3: 2503; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 2504; GFX11-NEXT: s_branch .LBB16_2 2505; GFX11-NEXT: .LBB16_4: 2506; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2507; GFX11-NEXT: .LBB16_5: ; %endif 2508; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2509; GFX11-NEXT: s_mov_b32 s2, -1 2510; GFX11-NEXT: s_waitcnt vmcnt(0) 2511; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2512; GFX11-NEXT: s_endpgm 2513; 2514; GFX12-LABEL: mul64_in_branch: 2515; GFX12: ; %bb.0: ; %entry 2516; GFX12-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 2517; GFX12-NEXT: s_wait_kmcnt 0x0 2518; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 2519; GFX12-NEXT: s_cbranch_scc0 .LBB16_3 2520; GFX12-NEXT: ; %bb.1: ; %else 2521; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] 2522; GFX12-NEXT: s_cbranch_execnz .LBB16_4 2523; GFX12-NEXT: .LBB16_2: ; %if 2524; GFX12-NEXT: s_mov_b32 s7, 0x31016000 2525; GFX12-NEXT: s_mov_b32 s6, -1 2526; GFX12-NEXT: s_mov_b32 s4, s2 2527; GFX12-NEXT: s_mov_b32 s5, s3 2528; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null 2529; GFX12-NEXT: s_branch .LBB16_5 2530; GFX12-NEXT: .LBB16_3: 2531; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5 2532; GFX12-NEXT: s_branch .LBB16_2 2533; GFX12-NEXT: .LBB16_4: 2534; 
GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2535; GFX12-NEXT: .LBB16_5: ; %endif 2536; GFX12-NEXT: s_mov_b32 s3, 0x31016000 2537; GFX12-NEXT: s_mov_b32 s2, -1 2538; GFX12-NEXT: s_wait_loadcnt 0x0 2539; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null 2540; GFX12-NEXT: s_endpgm 2541; 2542; EG-LABEL: mul64_in_branch: 2543; EG: ; %bb.0: ; %entry 2544; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] 2545; EG-NEXT: JUMP @3 POP:1 2546; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[] 2547; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] 2548; EG-NEXT: JUMP @8 POP:1 2549; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[] 2550; EG-NEXT: TEX 0 @12 2551; EG-NEXT: POP @8 POP:1 2552; EG-NEXT: ALU 1, @35, KC0[], KC1[] 2553; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 2554; EG-NEXT: CF_END 2555; EG-NEXT: PAD 2556; EG-NEXT: Fetch clause starting at 12: 2557; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 2558; EG-NEXT: ALU clause starting at 14: 2559; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, 2560; EG-NEXT: MOV * T1.W, literal.x, 2561; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 2562; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, 2563; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, 2564; EG-NEXT: ALU clause starting at 19: 2565; EG-NEXT: MOV T0.W, KC0[2].W, 2566; EG-NEXT: MOV * T1.W, KC0[3].Z, 2567; EG-NEXT: MOV T2.W, KC0[3].Y, 2568; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 2569; EG-NEXT: MOV T1.W, KC0[3].X, 2570; EG-NEXT: MULHI * T0.Y, T0.W, PV.W, 2571; EG-NEXT: ADD_INT T3.W, PS, T0.X, 2572; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W, 2573; EG-NEXT: ADD_INT T0.Y, PV.W, PS, 2574; EG-NEXT: MOV T1.W, literal.x, 2575; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W, 2576; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2577; EG-NEXT: ALU clause starting at 31: 2578; EG-NEXT: MOV T0.W, KC0[2].Y, 2579; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 2580; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 2581; EG-NEXT: ALU clause starting at 34: 2582; EG-NEXT: 
MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 35:
; EG-NEXT:    LSHR * T1.X, T0.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = icmp eq i64 %a, 0
  br i1 %0, label %if, label %else

if:
  %1 = load i64, ptr addrspace(1) %in
  br label %endif

else:
  %2 = mul i64 %a, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}

; 128-bit multiply where both operands are kernel arguments: computes
; %a * %b as i128 and stores the product to %out.
; The unnamed [8 x i32] parameters sit between %out, %a and %b in the
; argument list (NOTE(review): presumably padding so the two i128 values are
; fetched by separate kernarg loads -- confirm).
; In the checks below, the SI and VI sequences use vector multiplies
; (v_mul_hi_u32 / v_mad_u64_u32), GFX9 through GFX11 use scalar
; s_mul_i32 / s_mul_hi_u32, and GFX12 uses s_mul_u64.
define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
; SI-LABEL: s_mul_i128:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x1f
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s10
; SI-NEXT:    v_mul_hi_u32 v0, s12, v0
; SI-NEXT:    v_mov_b32_e32 v1, s8
; SI-NEXT:    v_mul_hi_u32 v1, s14, v1
; SI-NEXT:    s_mul_i32 s4, s12, s11
; SI-NEXT:    s_mul_i32 s5, s13, s10
; SI-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
; SI-NEXT:    s_mul_i32 s5, s14, s9
; SI-NEXT:    s_mul_i32 s4, s12, s10
; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT:    s_mul_i32 s5, s15, s8
; SI-NEXT:    v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT:    s_mul_i32 s5, s14, s8
; SI-NEXT:    v_mov_b32_e32 v2, s4
; SI-NEXT:    v_add_i32_e32 v2, vcc, s5, v2
; SI-NEXT:    v_addc_u32_e32 v0, vcc, v1, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s12
; SI-NEXT:    v_mul_hi_u32 v5, s8, v1
; SI-NEXT:    v_mul_hi_u32 v1, s9, v1
; SI-NEXT:    v_mov_b32_e32 v3, s13
; SI-NEXT:    v_mul_hi_u32 v4, s8, v3
; SI-NEXT:    s_mul_i32 s5, s9, s12
; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v5
; SI-NEXT:    s_mul_i32 s4, s8, s13
; SI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; SI-NEXT:    v_add_i32_e32 v1, vcc, s4, v5
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
; SI-NEXT:    v_mul_hi_u32 v3, s9, v3
; SI-NEXT:    v_addc_u32_e64 v5, s[4:5], 0, 0, vcc
; SI-NEXT:    s_mul_i32 s4, s9, s13
; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
; SI-NEXT:    s_mul_i32 s4, s8, s12
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_mul_i128:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x7c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v5, 0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s10
; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s12, v0, 0
; VI-NEXT:    s_mul_i32 s4, s12, s11
; VI-NEXT:    v_mov_b32_e32 v6, s12
; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
; VI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s8, v6, 0
; VI-NEXT:    s_mul_i32 s6, s13, s10
; VI-NEXT:    v_add_u32_e32 v3, vcc, s6, v3
; VI-NEXT:    v_mov_b32_e32 v4, v1
; VI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s9, v6, v[4:5]
; VI-NEXT:    v_mov_b32_e32 v8, s8
; VI-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s14, v8, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, v7
; VI-NEXT:    v_mov_b32_e32 v7, v5
; VI-NEXT:    v_mov_b32_e32 v8, s13
; VI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s8, v8, v[6:7]
; VI-NEXT:    s_mul_i32 s6, s15, s8
; VI-NEXT:    v_add_u32_e32 v6, vcc, s6, v2
; VI-NEXT:    v_mov_b32_e32 v2, v5
; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
; VI-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s9, v8, v[2:3]
; VI-NEXT:    s_mul_i32 s6, s14, s9
; VI-NEXT:    v_add_u32_e32 v5, vcc, s6, v6
; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
; VI-NEXT:    v_mov_b32_e32 v1, v4
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_mul_i128:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4c
; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x7c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s4, s12, s11
; GFX9-NEXT:    s_mul_hi_u32 s5, s12, s10
; GFX9-NEXT:    s_mul_i32 s6, s14, s9
; GFX9-NEXT:    s_mul_hi_u32 s7, s14, s8
; GFX9-NEXT:    s_add_i32 s4, s5, s4
; GFX9-NEXT:    s_mul_i32 s5, s13, s10
; GFX9-NEXT:    s_add_i32 s6, s7, s6
; GFX9-NEXT:    s_mul_i32 s7, s15, s8
; GFX9-NEXT:    s_add_i32 s4, s4, s5
; GFX9-NEXT:    s_mul_i32 s5, s12, s10
; GFX9-NEXT:    s_add_i32 s6, s6, s7
; GFX9-NEXT:    s_mul_i32 s7, s14, s8
; GFX9-NEXT:    s_add_u32 s7, s7, s5
; GFX9-NEXT:    s_addc_u32 s6, s6, s4
; GFX9-NEXT:    s_mul_i32 s14, s9, s12
; GFX9-NEXT:    s_mul_hi_u32 s15, s8, s12
; GFX9-NEXT:    s_mul_hi_u32 s11, s9, s12
; GFX9-NEXT:    s_add_u32 s14, s14, s15
; GFX9-NEXT:    s_mul_i32 s5, s8, s13
; GFX9-NEXT:    s_addc_u32 s11, s11, 0
; GFX9-NEXT:    s_mul_hi_u32 s10, s8, s13
; GFX9-NEXT:    s_add_u32 s5, s5, s14
; GFX9-NEXT:    s_addc_u32 s10, s10, 0
; GFX9-NEXT:    s_add_u32 s10, s11, s10
; GFX9-NEXT:    s_addc_u32 s11, 0, 0
; GFX9-NEXT:    s_mul_hi_u32 s14, s9, s13
; GFX9-NEXT:    s_mul_i32 s9, s9, s13
; GFX9-NEXT:    s_add_u32 s9, s9, s10
; GFX9-NEXT:    s_addc_u32 s10, s14, s11
; GFX9-NEXT:    s_mov_b32 s4, 0
; GFX9-NEXT:    s_add_u32 s9, s9, s7
; GFX9-NEXT:    s_addc_u32 s10, s10, s6
; GFX9-NEXT:    s_mul_i32 s6, s8, s12
; GFX9-NEXT:    s_mov_b32 s7, s4
; GFX9-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_mov_b32_e32 v2, s9
; GFX9-NEXT:    v_mov_b32_e32 v3, s10
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_mul_i128:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4c
; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x7c
; GFX10-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
; GFX10-NEXT:    s_mov_b32 s6, 0
; GFX10-NEXT:    s_mov_b32 s5, s6
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s3, s8, s3
; GFX10-NEXT:    s_mul_hi_u32 s4, s8, s2
; GFX10-NEXT:    s_mul_i32 s14, s10, s1
; GFX10-NEXT:    s_mul_hi_u32 s15, s10, s0
; GFX10-NEXT:    s_mul_i32 s7, s9, s2
; GFX10-NEXT:    s_mul_i32 s11, s11, s0
; GFX10-NEXT:    s_add_i32 s3, s4, s3
; GFX10-NEXT:    s_add_i32 s4, s15, s14
; GFX10-NEXT:    s_mul_i32 s2, s8, s2
; GFX10-NEXT:    s_mul_i32 s10, s10, s0
; GFX10-NEXT:    s_add_i32 s3, s3, s7
; GFX10-NEXT:    s_add_i32 s4, s4, s11
; GFX10-NEXT:    s_mul_i32 s19, s1, s8
; GFX10-NEXT:    s_mul_hi_u32 s20, s0, s8
; GFX10-NEXT:    s_add_u32 s2, s10, s2
; GFX10-NEXT:    s_mul_hi_u32 s18, s1, s8
; GFX10-NEXT:    s_addc_u32 s3, s4, s3
; GFX10-NEXT:    s_mul_i32 s17, s0, s9
; GFX10-NEXT:    s_add_u32 s4, s19, s20
; GFX10-NEXT:    s_mul_hi_u32 s16, s0, s9
; GFX10-NEXT:    s_mul_hi_u32 s21, s1, s9
; GFX10-NEXT:    s_mul_i32 s1, s1, s9
; GFX10-NEXT:    s_addc_u32 s9, s18, 0
; GFX10-NEXT:    s_add_u32 s7, s17, s4
; GFX10-NEXT:    s_addc_u32 s10, s16, 0
; GFX10-NEXT:    s_mul_i32 s4, s0, s8
; GFX10-NEXT:    s_add_u32 s0, s9, s10
; GFX10-NEXT:    s_addc_u32 s8, 0, 0
; GFX10-NEXT:    s_add_u32 s0, s1, s0
; GFX10-NEXT:    s_addc_u32 s1, s21, s8
; GFX10-NEXT:    s_add_u32 s2, s0, s2
; GFX10-NEXT:    s_addc_u32 s3, s1, s3
; GFX10-NEXT:    s_or_b64 s[0:1], s[4:5], s[6:7]
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    s_mov_b32 s15, 0x31016000
; GFX10-NEXT:    s_mov_b32 s14, -1
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_mul_i128:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x4c
; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x7c
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s6, 0
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_mov_b32 s13, s6
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s3, s8, s3
; GFX11-NEXT:    s_mul_hi_u32 s7, s8, s2
; GFX11-NEXT:    s_mul_i32 s14, s10, s1
; GFX11-NEXT:    s_mul_hi_u32 s15, s10, s0
; GFX11-NEXT:    s_mul_i32 s12, s9, s2
; GFX11-NEXT:    s_mul_i32 s11, s11, s0
; GFX11-NEXT:    s_add_i32 s3, s7, s3
; GFX11-NEXT:    s_add_i32 s7, s15, s14
; GFX11-NEXT:    s_mul_i32 s2, s8, s2
; GFX11-NEXT:    s_mul_i32 s10, s10, s0
; GFX11-NEXT:    s_add_i32 s3, s3, s12
; GFX11-NEXT:    s_add_i32 s7, s7, s11
; GFX11-NEXT:    s_mul_i32 s19, s1, s8
; GFX11-NEXT:    s_mul_hi_u32 s20, s0, s8
; GFX11-NEXT:    s_add_u32 s2, s10, s2
; GFX11-NEXT:    s_mul_hi_u32 s18, s1, s8
; GFX11-NEXT:    s_addc_u32 s3, s7, s3
; GFX11-NEXT:    s_mul_i32 s17, s0, s9
; GFX11-NEXT:    s_add_u32 s7, s19, s20
; GFX11-NEXT:    s_mul_hi_u32 s16, s0, s9
; GFX11-NEXT:    s_mul_hi_u32 s21, s1, s9
; GFX11-NEXT:    s_mul_i32 s1, s1, s9
; GFX11-NEXT:    s_addc_u32 s9, s18, 0
; GFX11-NEXT:    s_add_u32 s7, s17, s7
; GFX11-NEXT:    s_addc_u32 s10, s16, 0
; GFX11-NEXT:    s_mul_i32 s12, s0, s8
; GFX11-NEXT:    s_add_u32 s0, s9, s10
; GFX11-NEXT:    s_addc_u32 s8, 0, 0
; GFX11-NEXT:    s_add_u32 s0, s1, s0
; GFX11-NEXT:    s_addc_u32 s1, s21, s8
; GFX11-NEXT:    s_add_u32 s2, s0, s2
; GFX11-NEXT:    s_addc_u32 s3, s1, s3
; GFX11-NEXT:    s_or_b64 s[0:1], s[12:13], s[6:7]
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_mul_i128:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x7c
; GFX12-NEXT:    s_load_b128 s[12:15], s[4:5], 0x4c
; GFX12-NEXT:    s_mov_b32 s3, 0
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-NEXT:    s_mov_b32 s7, s3
; GFX12-NEXT:    s_mov_b32 s5, s3
; GFX12-NEXT:    s_mov_b32 s17, s3
; GFX12-NEXT:    s_mov_b32 s19, s3
; GFX12-NEXT:    s_mov_b32 s24, s3
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_mov_b32 s2, s8
; GFX12-NEXT:    s_mov_b32 s6, s12
; GFX12-NEXT:    s_mov_b32 s4, s13
; GFX12-NEXT:    s_mul_u64 s[22:23], s[6:7], s[2:3]
; GFX12-NEXT:    s_mul_u64 s[20:21], s[4:5], s[2:3]
; GFX12-NEXT:    s_mov_b32 s2, s23
; GFX12-NEXT:    s_mov_b32 s16, s9
; GFX12-NEXT:    s_mul_u64 s[10:11], s[10:11], s[12:13]
; GFX12-NEXT:    s_add_nc_u64 s[12:13], s[20:21], s[2:3]
; GFX12-NEXT:    s_mul_u64 s[6:7], s[6:7], s[16:17]
; GFX12-NEXT:    s_mov_b32 s2, s13
; GFX12-NEXT:    s_mov_b32 s13, s3
; GFX12-NEXT:    s_mul_u64 s[8:9], s[8:9], s[14:15]
; GFX12-NEXT:    s_add_nc_u64 s[6:7], s[6:7], s[12:13]
; GFX12-NEXT:    s_mul_u64 s[4:5], s[4:5], s[16:17]
; GFX12-NEXT:    s_mov_b32 s18, s7
; GFX12-NEXT:    s_mov_b32 s23, s3
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[18:19]
; GFX12-NEXT:    s_add_nc_u64 s[8:9], s[10:11], s[8:9]
; GFX12-NEXT:    s_mov_b32 s25, s6
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[4:5], s[2:3]
; GFX12-NEXT:    s_or_b64 s[6:7], s[22:23], s[24:25]
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[8:9]
; GFX12-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    s_mov_b32 s3, 0x31016000
; GFX12-NEXT:    s_mov_b32 s2, -1
; GFX12-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT:    s_endpgm
;
; EG-LABEL: s_mul_i128:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 41, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    MULLO_INT * T0.X, KC0[5].X, KC0[8].X,
; EG-NEXT:    MULHI * T0.Y, KC0[5].X, KC0[8].X,
; EG-NEXT:    MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W,
; EG-NEXT:    MULLO_INT * T0.W, KC0[8].X, KC0[5].Y,
; EG-NEXT:    MULHI * T1.X, KC0[5].X, KC0[7].W,
; EG-NEXT:    MULHI * T1.Y, KC0[4].W, KC0[8].X,
; EG-NEXT:    MULHI * T1.Z, KC0[8].Y, KC0[4].W,
; EG-NEXT:    MULLO_INT * T1.W, KC0[8].Y, KC0[5].X,
; EG-NEXT:    MULHI * T2.X, KC0[7].W, KC0[5].Y,
; EG-NEXT:    MULLO_INT * T2.Y, KC0[5].X, KC0[7].W,
; EG-NEXT:    MULHI * T2.Z, KC0[4].W, KC0[7].W,
; EG-NEXT:    ADD_INT T2.W, T2.Y, PS,
; EG-NEXT:    MULLO_INT * T3.X, KC0[4].W, KC0[8].X,
; EG-NEXT:    ADDC_UINT T2.Z, T2.Y, T2.Z,
; EG-NEXT:    ADDC_UINT T3.W, PS, PV.W,
; EG-NEXT:    MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z,
; EG-NEXT:    ADD_INT T2.X, T2.X, PS,
; EG-NEXT:    ADD_INT T2.Y, T1.Z, T1.W,
; EG-NEXT:    ADD_INT T1.Z, T1.Y, PV.W,
; EG-NEXT:    ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212
; EG-NEXT:    MULLO_INT * T1.X, KC0[8].Z, KC0[4].W,
; EG-NEXT:    ADD_INT T4.X, PV.W, PV.Z,
; EG-NEXT:    ADDC_UINT T1.Y, PV.W, PV.Z,
; EG-NEXT:    ADD_INT T1.Z, PV.Y, PS,
; EG-NEXT:    ADD_INT T0.W, PV.X, T0.W,
; EG-NEXT:    MULLO_INT * T1.X, KC0[7].W, KC0[5].Y,
; EG-NEXT:    ADD_INT T2.Y, PV.Z, PV.W,
; EG-NEXT:    ADDC_UINT T1.Z, T0.Z, PS,
; EG-NEXT:    ADD_INT T0.W, T0.Y, PV.Y,
; EG-NEXT:    ADDC_UINT * T1.W, T0.X, PV.X,
; EG-NEXT:    ADD_INT T0.Y, T0.X, T4.X,
; EG-NEXT:    ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122
; EG-NEXT:    ADD_INT T0.W, PV.W, PS,
; EG-NEXT:    ADD_INT * T1.W, PV.Y, PV.Z,
; EG-NEXT:    ADD_INT T0.W, PV.W, PS,
; EG-NEXT:    ADDC_UINT * T1.W, PV.Y, PV.Z,
; EG-NEXT:    ADD_INT * T0.W, PV.W, PS,
; EG-NEXT:    ADD_INT * T0.Z, T0.Y, T0.Z,
; EG-NEXT:    ADD_INT * T0.Y, T3.X, T2.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MULLO_INT * T0.X, KC0[4].W, KC0[7].W,
entry:
  %mul = mul i128 %a, %b
  store i128 %mul, ptr addrspace(1) %out
  ret void
}

; 128-bit multiply on values loaded from memory: each workitem loads one i128
; from %aptr and one from %bptr, indexed by workitem id x, multiplies them and
; stores the product.
; NOTE(review): %gep.out is computed from %bptr, not %out, so the product is
; written back over the b operand and the %out argument is never used. This
; looks unintentional -- confirm. Changing it alters the generated code, so
; all assertions for this function would have to be regenerated with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
; SI-LABEL: v_mul_i128:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
; SI-NEXT:    v_mov_b32_e32 v9, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_mov_b64 s[0:1], s[2:3]
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64
; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v3, v4, v3
; SI-NEXT:    v_mul_hi_u32 v10, v4, v2
; SI-NEXT:    v_mul_lo_u32 v12, v6, v1
; SI-NEXT:    v_mul_hi_u32 v13, v6, v0
; SI-NEXT:    v_mul_lo_u32 v17, v1, v4
; SI-NEXT:    v_mul_hi_u32 v18, v0, v4
; SI-NEXT:    v_mul_lo_u32 v11, v5, v2
; SI-NEXT:    v_mul_lo_u32 v7, v7, v0
; SI-NEXT:    v_mul_hi_u32 v16, v1, v4
; SI-NEXT:    v_mul_lo_u32 v15, v0, v5
; SI-NEXT:    v_mul_hi_u32 v14, v0, v5
; SI-NEXT:    v_mul_hi_u32 v19, v1, v5
; SI-NEXT:    v_mul_lo_u32 v5, v1, v5
; SI-NEXT:    v_add_i32_e32 v1, vcc, v10, v3
; SI-NEXT:    v_add_i32_e32 v3, vcc, v13, v12
; SI-NEXT:    v_mul_lo_u32 v2, v4, v2
; SI-NEXT:    v_mul_lo_u32 v6, v6, v0
; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
; SI-NEXT:    v_add_i32_e32 v4, vcc, v17, v18
; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v16, vcc
; SI-NEXT:    v_add_i32_e32 v11, vcc, v1, v11
; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
; SI-NEXT:    v_add_i32_e32 v1, vcc, v15, v4
; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v14, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v3, v11, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
; SI-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
; SI-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
; SI-NEXT:    v_addc_u32_e32 v5, vcc, v19, v6, vcc
; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
; SI-NEXT:    buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_mul_i128:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
; VI-NEXT:    v_mov_b32_e32 v10, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_add_u32_e32 v12, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v13, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_lo_u32 v3, v4, v3
; VI-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v4, v2, 0
; VI-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
; VI-NEXT:    v_mul_lo_u32 v2, v5, v2
; VI-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
; VI-NEXT:    v_add_u32_e32 v15, vcc, v3, v2
; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v1, v4, v[9:10]
; VI-NEXT:    v_mov_b32_e32 v4, v3
; VI-NEXT:    v_mov_b32_e32 v3, v10
; VI-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v5, v[2:3]
; VI-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v6, v0, v[14:15]
; VI-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
; VI-NEXT:    v_addc_u32_e64 v4, s[0:1], 0, 0, vcc
; VI-NEXT:    v_mul_lo_u32 v0, v7, v0
; VI-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v5, v[3:4]
; VI-NEXT:    v_mul_lo_u32 v1, v6, v1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v10
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_add_u32_e32 v10, vcc, v3, v9
; VI-NEXT:    v_addc_u32_e32 v11, vcc, v4, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v9, v2
; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_mul_i128:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT:    v_lshlrev_b32_e32 v12, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v11, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v12, s[0:1]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v12, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v2
; GFX9-NEXT:    v_mul_lo_u32 v13, v4, v3
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v4, v2, 0
; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
; GFX9-NEXT:    v_add3_u32 v9, v9, v13, v10
; GFX9-NEXT:    v_mul_lo_u32 v13, v6, v1
; GFX9-NEXT:    v_mov_b32_e32 v10, v3
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11]
; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v6, v0, v[8:9]
; GFX9-NEXT:    v_mov_b32_e32 v10, v4
; GFX9-NEXT:    v_mov_b32_e32 v4, v11
; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4]
; GFX9-NEXT:    v_mul_lo_u32 v0, v7, v0
; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v4
; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, 0, vcc
; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[10:11]
; GFX9-NEXT:    v_add3_u32 v0, v0, v9, v13
; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v0, vcc
; GFX9-NEXT:    global_store_dwordx4 v12, v[2:5], s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_mul_i128:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v13, 4, v0
; GFX10-NEXT:    v_mov_b32_e32 v10, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v13, s[0:1]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v13, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s0, v0, v4, 0
; GFX10-NEXT:    v_mul_lo_u32 v15, v5, v2
; GFX10-NEXT:    v_mul_lo_u32 v7, v7, v0
; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10]
; GFX10-NEXT:    v_mov_b32_e32 v14, v12
; GFX10-NEXT:    v_mov_b32_e32 v12, v10
; GFX10-NEXT:    v_mad_u64_u32 v[9:10], s0, v0, v5, v[11:12]
; GFX10-NEXT:    v_mul_lo_u32 v11, v4, v3
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v2, 0
; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v10
; GFX10-NEXT:    v_add3_u32 v3, v3, v11, v15
; GFX10-NEXT:    v_add_co_u32 v10, s0, v14, v4
; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s0, 0, 0, s0
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v5, v[10:11]
; GFX10-NEXT:    v_add3_u32 v3, v7, v3, v12
; GFX10-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT:    global_store_dwordx4 v13, v[8:11], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i128:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v15, s[0:1]
; GFX11-NEXT:    global_load_b128 v[4:7], v15, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v0, v4, 0
; GFX11-NEXT:    v_mul_lo_u32 v14, v5, v2
; GFX11-NEXT:    v_mul_lo_u32 v3, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10]
; GFX11-NEXT:    v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v0, v5, v[11:12]
; GFX11-NEXT:    v_mad_u64_u32 v[11:12], null, v4, v2, 0
; GFX11-NEXT:    v_mul_lo_u32 v4, v6, v1
; GFX11-NEXT:    v_mov_b32_e32 v2, v10
; GFX11-NEXT:    v_mul_lo_u32 v10, v7, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add3_u32 v12, v12, v3, v14
; GFX11-NEXT:    v_add_co_u32 v2, s0, v13, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
; GFX11-NEXT:    v_mad_u64_u32 v[13:14], null, v6, v0, v[11:12]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3]
; GFX11-NEXT:    v_add3_u32 v0, v10, v14, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_co_u32 v10, vcc_lo, v6, v13
; GFX11-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
; GFX11-NEXT:    global_store_b128 v15, v[8:11], s[2:3]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_mul_i128:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b128 v[0:3], v13, s[0:1]
; GFX12-NEXT:    global_load_b128 v[4:7], v13, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
; GFX12-NEXT:    v_mul_lo_u32 v15, v5, v2
; GFX12-NEXT:    v_mul_lo_u32 v7, v7, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10]
; GFX12-NEXT:    v_mov_b32_e32 v14, v12
; GFX12-NEXT:    v_mov_b32_e32 v12, v10
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12]
; GFX12-NEXT:    v_mul_lo_u32 v11, v4, v3
; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v4, v2, 0
; GFX12-NEXT:    v_mul_lo_u32 v12, v6, v1
; GFX12-NEXT:    v_mov_b32_e32 v4, v10
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT:    v_add3_u32 v3, v3, v11, v15
; GFX12-NEXT:    v_add_co_u32 v10, s0, v14, v4
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3]
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11]
; GFX12-NEXT:    v_add3_u32 v3, v7, v3, v12
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT:    v_add_co_u32 v10, vcc_lo, v0, v2
; GFX12-NEXT:    v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT:    global_store_b128 v13, v[8:11], s[2:3]
; GFX12-NEXT:    s_endpgm
;
; EG-LABEL: v_mul_i128:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 3, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 41, @14, KC0[], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T2.XYZW, T1.X, 0, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ADD_INT * T1.X, KC0[2].W, PV.W,
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    MULLO_INT * T1.Y, T0.Y, T2.Y,
; EG-NEXT:    MULHI * T1.Z, T0.Y, T2.Y,
; EG-NEXT:    MULLO_INT * T1.W, T2.Z, T0.X,
; EG-NEXT:    MULLO_INT * T3.X, T2.Y, T0.Z,
; EG-NEXT:    MULHI * T3.Y, T0.Y, T2.X,
; EG-NEXT:    MULHI * T3.Z, T0.X, T2.Y,
; EG-NEXT:    MULHI * T3.W, T2.Z, T0.X,
; EG-NEXT:    MULLO_INT * T2.Z, T2.Z, T0.Y,
; EG-NEXT:    MULHI * T4.X, T2.X, T0.Z,
; EG-NEXT:    MULLO_INT * T0.Y, T0.Y, T2.X,
; EG-NEXT:    MULHI * T4.Y, T0.X, T2.X,
; EG-NEXT:    ADD_INT T4.W, T0.Y, PS,
; EG-NEXT:    MULLO_INT * T2.Y, T0.X, T2.Y,
; EG-NEXT:    ADDC_UINT T4.Z, T0.Y, T4.Y,
; EG-NEXT:    ADDC_UINT T5.W, PS, PV.W,
; EG-NEXT:    MULLO_INT * T0.Y, T2.X, T0.W,
; EG-NEXT:    ADD_INT T4.X, T4.X, PS,
; EG-NEXT:    ADD_INT T0.Y, T3.W, T2.Z,
; EG-NEXT:    ADD_INT T2.Z, T3.Z, PV.W,
; EG-NEXT:    ADD_INT T0.W, T3.Y, PV.Z,
; EG-NEXT:    MULLO_INT * T2.W, T2.W, T0.X,
; EG-NEXT:    ADD_INT T5.X, PV.W, PV.Z,
; EG-NEXT:    ADDC_UINT T3.Y, PV.W, PV.Z,
; EG-NEXT:    ADD_INT T2.Z, PV.Y, PS,
; EG-NEXT:    ADD_INT T0.W, PV.X, T3.X,
; EG-NEXT:    MULLO_INT * T0.Y, T2.X, T0.Z,
; EG-NEXT:    ADD_INT T4.Y, PV.Z, PV.W,
; EG-NEXT:    ADDC_UINT T0.Z, T1.W, PS,
; EG-NEXT:    ADD_INT T0.W, T1.Z, PV.Y,
; EG-NEXT:    ADDC_UINT * T2.W, T1.Y, PV.X,
; EG-NEXT:    ADD_INT T1.Y, T1.Y, T5.X,
; EG-NEXT:    ADD_INT T1.Z, T1.W, T0.Y,
; EG-NEXT:    ADD_INT T0.W, PV.W, PS,
; EG-NEXT:    ADD_INT * T1.W, PV.Y, PV.Z,
; EG-NEXT:    ADD_INT T0.W, PV.W, PS,
; EG-NEXT:    ADDC_UINT * T1.W, PV.Y, PV.Z,
; EG-NEXT:    ADD_INT * T0.W, PV.W, PS,
; EG-NEXT:    ADD_INT * T0.Z, T1.Y, T1.Z,
; EG-NEXT:    ADD_INT * T0.Y, T2.Y, T4.W,
; EG-NEXT:    LSHR T1.X, T1.X, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, T0.X, T2.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
  %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid
  %a = load i128, ptr addrspace(1) %gep.a
  %b = load i128, ptr addrspace(1) %gep.b
  %mul = mul i128 %a, %b
  store i128 %mul, ptr addrspace(1) %gep.out
  ret void
}

; Non-kernel function multiplying an i32 by 9, i.e. (1 << 3) + 1.
; The GFX9 and later checks expect one v_lshl_add_u32 (shift left by 3, add
; the original value); the SI and VI checks expect v_mul_lo_u32 with the
; constant 9. The EG check block contains only CF_END and PAD.
define i32 @mul_pow2_plus_1(i32 %val) {
; SI-LABEL: mul_pow2_plus_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_pow2_plus_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mul_lo_u32 v0, v0, 9
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: mul_pow2_plus_1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: mul_pow2_plus_1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: mul_pow2_plus_1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: mul_pow2_plus_1:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 3, v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
;
; EG-LABEL: mul_pow2_plus_1:
; EG:       ; %bb.0:
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
  %mul = mul i32 %val, 9
  ret i32 %mul
}

; Intrinsic called by v_mul_i128 above to obtain %tid, the per-workitem index.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone}