; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s

; mul24 and mad24 are affected

; Element-wise multiply of two <2 x i32> values loaded from %in (element 0 and
; element 1 of a <2 x i32> array); the product is stored to %out.
define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: test_mul_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v1, v1, v3
; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_mul_v2i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_lo_u32 v1, v1, v3
; VI-NEXT:    v_mul_lo_u32 v0, v0, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_mul_v2i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: test_mul_v2i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_mul_v2i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v3
; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v2
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: test_mul_v2i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    MULLO_INT * T0.Y, T0.Y, T0.W,
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, T0.X, T0.Z,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rhs.addr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
  %lhs = load <2 x i32>, ptr addrspace(1) %in
  %rhs = load <2 x i32>, ptr addrspace(1) %rhs.addr
  %product = mul <2 x i32> %lhs, %rhs
  store <2 x i32> %product, ptr addrspace(1) %out
  ret void
}

; Same shape as the <2 x i32> case above, widened to <4 x i32>: both operands
; are loaded from %in (elements 0 and 1) and the product is stored to %out.
define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_mul_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v3, v3, v7
; SI-NEXT:    v_mul_lo_u32 v2, v2, v6
; SI-NEXT:    v_mul_lo_u32 v1, v1, v5
; SI-NEXT:    v_mul_lo_u32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_mul_v4i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_lo_u32 v3, v3, v7
; VI-NEXT:    v_mul_lo_u32 v2, v2, v6
; VI-NEXT:    v_mul_lo_u32 v1, v1, v5
; VI-NEXT:    v_mul_lo_u32 v0, v0, v4
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_mul_v4i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v7
; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v6
; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v5
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v4
; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_mul_v4i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX10-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v7
; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v6
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v5
; GFX10-NEXT:    v_mul_lo_u32 v0, v0, v4
; GFX10-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_v4i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v3, v3, v7
; GFX11-NEXT:    v_mul_lo_u32 v2, v2, v6
; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v5
; GFX11-NEXT:    v_mul_lo_u32 v0, v0, v4
; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: v_mul_v4i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    MULLO_INT * T0.W, T0.W, T1.W,
; EG-NEXT:    MULLO_INT * T0.Z, T0.Z, T1.Z,
; EG-NEXT:    MULLO_INT * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, T0.X, T1.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %rhs.addr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
  %lhs = load <4 x i32>, ptr addrspace(1) %in
  %rhs = load <4 x i32>, ptr addrspace(1) %rhs.addr
  %product = mul <4 x i32> %lhs, %rhs
  store <4 x i32> %product, ptr addrspace(1) %out
  ret void
}

; i64 multiply of two kernel arguments whose result is truncated to i32 before
; being stored; the expected output uses a single 32-bit multiply.
define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, i64 %b) {
; SI-LABEL: s_trunc_i64_mul_to_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s7, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s4, s7, s6
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_trunc_i64_mul_to_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s7, s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s4, s7, s6
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_trunc_i64_mul_to_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x34
; GFX9-NEXT:    ; kill: killed $sgpr0_sgpr1
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s4, s7, s6
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_trunc_i64_mul_to_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mul_i32 s0, s2, s6
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_trunc_i64_mul_to_i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mul_i32 s0, s0, s6
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    v_mov_b32_e32 v0, s0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: s_trunc_i64_mul_to_i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MULLO_INT * T1.X, KC0[3].Y, KC0[2].W,
entry:
  %wide = mul i64 %b, %a
  %narrow = trunc i64 %wide to i32
  store i32 %narrow, ptr addrspace(1) %out, align 8
  ret void
}

; Like the kernel-argument variant above, but both i64 operands are loaded from
; memory (%aptr, %bptr) before the multiply and truncation to i32.
define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_trunc_i64_mul_to_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_trunc_i64_mul_to_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s14, s2
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s12, s6
; VI-NEXT:    s_mov_b32 s13, s7
; VI-NEXT:    s_mov_b32 s15, s3
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mul_lo_u32 v0, v1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_trunc_i64_mul_to_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s3, 0xf000
; GFX9-NEXT:    s_mov_b32 s2, -1
; GFX9-NEXT:    s_mov_b32 s14, s2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b32 s15, s3
; GFX9-NEXT:    s_mov_b32 s10, s2
; GFX9-NEXT:    s_mov_b32 s11, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s0, s4
; GFX9-NEXT:    s_mov_b32 s1, s5
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, v1, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_trunc_i64_mul_to_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-NEXT:    s_mov_b32 s2, -1
; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
; GFX10-NEXT:    s_mov_b32 s14, s2
; GFX10-NEXT:    s_mov_b32 s15, s3
; GFX10-NEXT:    s_mov_b32 s10, s2
; GFX10-NEXT:    s_mov_b32 s11, s3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s12, s6
; GFX10-NEXT:    s_mov_b32 s13, s7
; GFX10-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s0, s4
; GFX10-NEXT:    s_mov_b32 s1, s5
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v0, v1, v0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_trunc_i64_mul_to_i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT:    s_mov_b32 s10, -1
; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-NEXT:    s_mov_b32 s14, s10
; GFX11-NEXT:    s_mov_b32 s15, s11
; GFX11-NEXT:    s_mov_b32 s2, s10
; GFX11-NEXT:    s_mov_b32 s3, s11
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s12, s6
; GFX11-NEXT:    s_mov_b32 s13, s7
; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
; GFX11-NEXT:    s_mov_b32 s8, s4
; GFX11-NEXT:    s_mov_b32 s9, s5
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v0, v1, v0
; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: v_trunc_i64_mul_to_i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV T0.X, KC0[2].Z,
; EG-NEXT:    MOV * T1.X, KC0[2].W,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:    LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, T1.X, T0.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %lhs = load i64, ptr addrspace(1) %aptr, align 8
  %rhs = load i64, ptr addrspace(1) %bptr, align 8
  %wide = mul i64 %rhs, %lhs
  %narrow = trunc i64 %wide to i32
  store i32 %narrow, ptr addrspace(1) %out, align 8
  ret void
}

; This 64-bit multiply should just use MUL_HI and MUL_LO, since the top
; 32 bits of both arguments are sign bits.

; Sign-extends the i32 kernel argument %in to i64 and multiplies it by the
; constant 80; the 64-bit product is stored to %out.
define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: mul64_sext_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    v_mov_b32_e32 v0, 0x50
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mul_hi_i32 v1, s4, v0
; SI-NEXT:    s_mulk_i32 s4, 0x50
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: mul64_sext_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x50
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_nop 2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: mul64_sext_c:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_hi_i32 s0, s2, 0x50
; GFX9-NEXT:    s_mulk_i32 s2, 0x50
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: mul64_sext_c:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s0, s2, 0x50
; GFX10-NEXT:    s_mul_hi_i32 s1, s2, 0x50
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: mul64_sext_c:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s3, s2, 0x50
; GFX11-NEXT:    s_mul_hi_i32 s2, s2, 0x50
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: mul64_sext_c:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    MULHI_INT * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, KC0[2].Z, literal.y,
; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
entry:
  %ext = sext i32 %in to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; Loads an i32 from %in, sign-extends it to i64 and multiplies by the constant
; 80; the 64-bit product is stored to %out.
define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_mul64_sext_c:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_movk_i32 s2, 0x50
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_i32 v1, v0, s2
; SI-NEXT:    v_mul_lo_u32 v0, v0, s2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_mul64_sext_c:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_movk_i32 s2, 0x50
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_mul64_sext_c:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    s_movk_i32 s2, 0x50
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_hi_i32 v1, v0, s2
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_mul64_sext_c:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_hi_i32 v1, 0x50, v0
; GFX10-NEXT:    v_mul_lo_u32 v0, 0x50, v0
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul64_sext_c:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_hi_i32 v1, 0x50, v0
; GFX11-NEXT:    v_mul_lo_u32 v0, 0x50, v0
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: v_mul64_sext_c:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    MULHI_INT * T0.Y, T0.X, literal.x,
; EG-NEXT:    80(1.121039e-43), 0(0.000000e+00)
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 80(1.121039e-43)
entry:
  %loaded = load i32, ptr addrspace(1) %in, align 4
  %wide = sext i32 %loaded to i64
  %scaled = mul i64 %wide, 80
  store i64 %scaled, ptr addrspace(1) %out, align 8
  ret void
}

; Same as the multiply-by-80 case above, but the constant is 9, which the
; expected output encodes directly as an instruction operand.
define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_mul64_sext_inline_imm:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mul_hi_i32 v1, v0, 9
; SI-NEXT:    v_mul_lo_u32 v0, v0, 9
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_mul64_sext_inline_imm:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_mul64_sext_inline_imm:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_mov_b32 s10, s6
; GFX9-NEXT:    s_mov_b32 s11, s7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s8, s2
; GFX9-NEXT:    s_mov_b32 s9, s3
; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_hi_i32 v1, v0, 9
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 9
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_mul64_sext_inline_imm:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_hi_i32 v1, v0, 9
; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 9
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul64_sext_inline_imm:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s10, s6
; GFX11-NEXT:    s_mov_b32 s11, s7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s8, s2
; GFX11-NEXT:    s_mov_b32 s9, s3
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_hi_i32 v1, v0, 9
; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 9
; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: v_mul64_sext_inline_imm:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    MULHI_INT * T0.Y, T0.X, literal.x,
; EG-NEXT:    9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    MULLO_INT * T0.X, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 9(1.261169e-44)
entry:
  %loaded = load i32, ptr addrspace(1) %in, align 4
  %wide = sext i32 %loaded to i64
  %scaled = mul i64 %wide, 9
  store i64 %scaled, ptr addrspace(1) %out, align 8
  ret void
}

; 32-bit multiply of two i32 kernel arguments; the unnamed [8 x i32] arguments
; sit between %out, %a and %b in the argument list.
define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: s_mul_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0x13
; SI-NEXT:    s_load_dword s5, s[0:1], 0x1c
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s4, s4, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_mul_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
; VI-NEXT:    s_load_dword s5, s[0:1], 0x70
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_mul_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s0, s2, s3
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_mul_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mul_i32 s0, s2, s3
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_mul_i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT:    s_load_b32 s3, s[0:1], 0x70
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_mul_i32 s2, s2, s3
; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    s_mov_b32 s2, -1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT:    s_nop 0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
;
; EG-LABEL: s_mul_i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MULLO_INT * T1.X, KC0[4].Z, KC0[6].W,
entry:
  %product = mul i32 %a, %b
  store i32 %product, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_mul_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
959; SI-NEXT: v_mul_lo_u32 v0, v0, v1 960; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 961; SI-NEXT: s_endpgm 962; 963; VI-LABEL: v_mul_i32: 964; VI: ; %bb.0: ; %entry 965; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 966; VI-NEXT: s_mov_b32 s7, 0xf000 967; VI-NEXT: s_mov_b32 s6, -1 968; VI-NEXT: s_mov_b32 s10, s6 969; VI-NEXT: s_mov_b32 s11, s7 970; VI-NEXT: s_waitcnt lgkmcnt(0) 971; VI-NEXT: s_mov_b32 s8, s2 972; VI-NEXT: s_mov_b32 s9, s3 973; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 974; VI-NEXT: s_mov_b32 s4, s0 975; VI-NEXT: s_mov_b32 s5, s1 976; VI-NEXT: s_waitcnt vmcnt(0) 977; VI-NEXT: v_mul_lo_u32 v0, v0, v1 978; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 979; VI-NEXT: s_endpgm 980; 981; GFX9-LABEL: v_mul_i32: 982; GFX9: ; %bb.0: ; %entry 983; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 984; GFX9-NEXT: s_mov_b32 s7, 0xf000 985; GFX9-NEXT: s_mov_b32 s6, -1 986; GFX9-NEXT: s_mov_b32 s10, s6 987; GFX9-NEXT: s_mov_b32 s11, s7 988; GFX9-NEXT: s_waitcnt lgkmcnt(0) 989; GFX9-NEXT: s_mov_b32 s8, s2 990; GFX9-NEXT: s_mov_b32 s9, s3 991; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 992; GFX9-NEXT: s_mov_b32 s4, s0 993; GFX9-NEXT: s_mov_b32 s5, s1 994; GFX9-NEXT: s_waitcnt vmcnt(0) 995; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1 996; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 997; GFX9-NEXT: s_endpgm 998; 999; GFX10-LABEL: v_mul_i32: 1000; GFX10: ; %bb.0: ; %entry 1001; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1002; GFX10-NEXT: s_mov_b32 s6, -1 1003; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1004; GFX10-NEXT: s_mov_b32 s10, s6 1005; GFX10-NEXT: s_mov_b32 s11, s7 1006; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX10-NEXT: s_mov_b32 s8, s2 1008; GFX10-NEXT: s_mov_b32 s9, s3 1009; GFX10-NEXT: s_mov_b32 s4, s0 1010; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1011; GFX10-NEXT: s_mov_b32 s5, s1 1012; GFX10-NEXT: s_waitcnt vmcnt(0) 1013; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1 1014; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 1015; 
GFX10-NEXT: s_endpgm 1016; 1017; GFX11-LABEL: v_mul_i32: 1018; GFX11: ; %bb.0: ; %entry 1019; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1020; GFX11-NEXT: s_mov_b32 s6, -1 1021; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1022; GFX11-NEXT: s_mov_b32 s10, s6 1023; GFX11-NEXT: s_mov_b32 s11, s7 1024; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX11-NEXT: s_mov_b32 s8, s2 1026; GFX11-NEXT: s_mov_b32 s9, s3 1027; GFX11-NEXT: s_mov_b32 s4, s0 1028; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 1029; GFX11-NEXT: s_mov_b32 s5, s1 1030; GFX11-NEXT: s_waitcnt vmcnt(0) 1031; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 1032; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 1033; GFX11-NEXT: s_nop 0 1034; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1035; GFX11-NEXT: s_endpgm 1036; 1037; EG-LABEL: v_mul_i32: 1038; EG: ; %bb.0: ; %entry 1039; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1040; EG-NEXT: TEX 0 @6 1041; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] 1042; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1043; EG-NEXT: CF_END 1044; EG-NEXT: PAD 1045; EG-NEXT: Fetch clause starting at 6: 1046; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1047; EG-NEXT: ALU clause starting at 8: 1048; EG-NEXT: MOV * T0.X, KC0[2].Z, 1049; EG-NEXT: ALU clause starting at 9: 1050; EG-NEXT: LSHR T1.X, KC0[2].Y, literal.x, 1051; EG-NEXT: MULLO_INT * T0.X, T0.X, T0.Y, 1052; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1053entry: 1054 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1055 %a = load i32, ptr addrspace(1) %in 1056 %b = load i32, ptr addrspace(1) %b_ptr 1057 %result = mul i32 %a, %b 1058 store i32 %result, ptr addrspace(1) %out 1059 ret void 1060} 1061 1062define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 x i32], i1 %b) nounwind { 1063; SI-LABEL: s_mul_i1: 1064; SI: ; %bb.0: ; %entry 1065; SI-NEXT: s_load_dword s4, s[0:1], 0x13 1066; SI-NEXT: s_load_dword s5, s[0:1], 0x1c 1067; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1068; SI-NEXT: s_mov_b32 s3, 0xf000 1069; 
SI-NEXT: s_mov_b32 s2, -1 1070; SI-NEXT: s_waitcnt lgkmcnt(0) 1071; SI-NEXT: s_mul_i32 s4, s4, s5 1072; SI-NEXT: s_and_b32 s4, s4, 1 1073; SI-NEXT: v_mov_b32_e32 v0, s4 1074; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1075; SI-NEXT: s_endpgm 1076; 1077; VI-LABEL: s_mul_i1: 1078; VI: ; %bb.0: ; %entry 1079; VI-NEXT: s_load_dword s4, s[0:1], 0x70 1080; VI-NEXT: s_load_dword s5, s[0:1], 0x4c 1081; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1082; VI-NEXT: s_mov_b32 s3, 0xf000 1083; VI-NEXT: s_mov_b32 s2, -1 1084; VI-NEXT: s_waitcnt lgkmcnt(0) 1085; VI-NEXT: v_mov_b32_e32 v0, s4 1086; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0 1087; VI-NEXT: v_and_b32_e32 v0, 1, v0 1088; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1089; VI-NEXT: s_endpgm 1090; 1091; GFX9-LABEL: s_mul_i1: 1092; GFX9: ; %bb.0: ; %entry 1093; GFX9-NEXT: s_load_dword s2, s[0:1], 0x70 1094; GFX9-NEXT: s_load_dword s3, s[0:1], 0x4c 1095; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1096; GFX9-NEXT: s_mov_b32 s7, 0xf000 1097; GFX9-NEXT: s_mov_b32 s6, -1 1098; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX9-NEXT: v_mov_b32_e32 v0, s2 1100; GFX9-NEXT: v_mul_lo_u16_e32 v0, s3, v0 1101; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1102; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 1103; GFX9-NEXT: s_endpgm 1104; 1105; GFX10-LABEL: s_mul_i1: 1106; GFX10: ; %bb.0: ; %entry 1107; GFX10-NEXT: s_clause 0x2 1108; GFX10-NEXT: s_load_dword s2, s[0:1], 0x4c 1109; GFX10-NEXT: s_load_dword s3, s[0:1], 0x70 1110; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 1111; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1112; GFX10-NEXT: s_mov_b32 s6, -1 1113; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX10-NEXT: v_mul_lo_u16 v0, s2, s3 1115; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 1116; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 1117; GFX10-NEXT: s_endpgm 1118; 1119; GFX11-LABEL: s_mul_i1: 1120; GFX11: ; %bb.0: ; %entry 1121; GFX11-NEXT: s_clause 0x2 1122; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c 1123; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70 1124; 
GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 1125; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1126; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3 1127; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1128; GFX11-NEXT: s_mov_b32 s2, -1 1129; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1130; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 1131; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 1132; GFX11-NEXT: s_nop 0 1133; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1134; GFX11-NEXT: s_endpgm 1135; 1136; EG-LABEL: s_mul_i1: 1137; EG: ; %bb.0: ; %entry 1138; EG-NEXT: ALU 0, @10, KC0[], KC1[] 1139; EG-NEXT: TEX 1 @6 1140; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 1141; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1142; EG-NEXT: CF_END 1143; EG-NEXT: PAD 1144; EG-NEXT: Fetch clause starting at 6: 1145; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3 1146; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3 1147; EG-NEXT: ALU clause starting at 10: 1148; EG-NEXT: MOV * T0.X, 0.0, 1149; EG-NEXT: ALU clause starting at 11: 1150; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 1151; EG-NEXT: MULLO_INT * T0.X, T1.X, T0.X, 1152; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1153; EG-NEXT: AND_INT T1.W, PS, 1, 1154; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1155; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1156; EG-NEXT: LSHL T0.X, PV.W, PS, 1157; EG-NEXT: LSHL * T0.W, literal.x, PS, 1158; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1159; EG-NEXT: MOV T0.Y, 0.0, 1160; EG-NEXT: MOV * T0.Z, 0.0, 1161; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1162; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1163entry: 1164 %mul = mul i1 %a, %b 1165 store i1 %mul, ptr addrspace(1) %out, align 4 1166 ret void 1167} 1168 1169define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1170; SI-LABEL: v_mul_i1: 1171; SI: ; %bb.0: ; %entry 1172; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1173; SI-NEXT: s_mov_b32 s7, 0xf000 1174; SI-NEXT: s_mov_b32 s6, -1 1175; SI-NEXT: s_mov_b32 s10, s6 1176; SI-NEXT: s_mov_b32 s11, s7 1177; SI-NEXT: s_waitcnt lgkmcnt(0) 
1178; SI-NEXT: s_mov_b32 s8, s2 1179; SI-NEXT: s_mov_b32 s9, s3 1180; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1181; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1182; SI-NEXT: s_mov_b32 s4, s0 1183; SI-NEXT: s_mov_b32 s5, s1 1184; SI-NEXT: s_waitcnt vmcnt(0) 1185; SI-NEXT: v_mul_lo_u32 v0, v0, v1 1186; SI-NEXT: v_and_b32_e32 v0, 1, v0 1187; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1188; SI-NEXT: s_endpgm 1189; 1190; VI-LABEL: v_mul_i1: 1191; VI: ; %bb.0: ; %entry 1192; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1193; VI-NEXT: s_mov_b32 s7, 0xf000 1194; VI-NEXT: s_mov_b32 s6, -1 1195; VI-NEXT: s_mov_b32 s10, s6 1196; VI-NEXT: s_mov_b32 s11, s7 1197; VI-NEXT: s_waitcnt lgkmcnt(0) 1198; VI-NEXT: s_mov_b32 s8, s2 1199; VI-NEXT: s_mov_b32 s9, s3 1200; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1201; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1202; VI-NEXT: s_mov_b32 s4, s0 1203; VI-NEXT: s_mov_b32 s5, s1 1204; VI-NEXT: s_waitcnt vmcnt(0) 1205; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1 1206; VI-NEXT: v_and_b32_e32 v0, 1, v0 1207; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1208; VI-NEXT: s_endpgm 1209; 1210; GFX9-LABEL: v_mul_i1: 1211; GFX9: ; %bb.0: ; %entry 1212; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1213; GFX9-NEXT: s_mov_b32 s7, 0xf000 1214; GFX9-NEXT: s_mov_b32 s6, -1 1215; GFX9-NEXT: s_mov_b32 s10, s6 1216; GFX9-NEXT: s_mov_b32 s11, s7 1217; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1218; GFX9-NEXT: s_mov_b32 s8, s2 1219; GFX9-NEXT: s_mov_b32 s9, s3 1220; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1221; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1222; GFX9-NEXT: s_mov_b32 s4, s0 1223; GFX9-NEXT: s_mov_b32 s5, s1 1224; GFX9-NEXT: s_waitcnt vmcnt(0) 1225; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 1226; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 1227; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0 1228; GFX9-NEXT: s_endpgm 1229; 1230; GFX10-LABEL: v_mul_i1: 1231; GFX10: ; %bb.0: ; %entry 1232; GFX10-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 1233; GFX10-NEXT: s_mov_b32 s6, -1 1234; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1235; GFX10-NEXT: s_mov_b32 s10, s6 1236; GFX10-NEXT: s_mov_b32 s11, s7 1237; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX10-NEXT: s_mov_b32 s8, s2 1239; GFX10-NEXT: s_mov_b32 s9, s3 1240; GFX10-NEXT: s_clause 0x1 1241; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 1242; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4 1243; GFX10-NEXT: s_mov_b32 s4, s0 1244; GFX10-NEXT: s_mov_b32 s5, s1 1245; GFX10-NEXT: s_waitcnt vmcnt(0) 1246; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 1247; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 1248; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0 1249; GFX10-NEXT: s_endpgm 1250; 1251; GFX11-LABEL: v_mul_i1: 1252; GFX11: ; %bb.0: ; %entry 1253; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1254; GFX11-NEXT: s_mov_b32 s6, -1 1255; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1256; GFX11-NEXT: s_mov_b32 s10, s6 1257; GFX11-NEXT: s_mov_b32 s11, s7 1258; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX11-NEXT: s_mov_b32 s8, s2 1260; GFX11-NEXT: s_mov_b32 s9, s3 1261; GFX11-NEXT: s_clause 0x1 1262; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0 1263; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4 1264; GFX11-NEXT: s_mov_b32 s4, s0 1265; GFX11-NEXT: s_mov_b32 s5, s1 1266; GFX11-NEXT: s_waitcnt vmcnt(0) 1267; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1 1268; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1269; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 1270; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0 1271; GFX11-NEXT: s_nop 0 1272; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1273; GFX11-NEXT: s_endpgm 1274; 1275; EG-LABEL: v_mul_i1: 1276; EG: ; %bb.0: ; %entry 1277; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1278; EG-NEXT: TEX 1 @6 1279; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 1280; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1281; EG-NEXT: CF_END 1282; EG-NEXT: PAD 1283; EG-NEXT: Fetch clause starting at 6: 1284; EG-NEXT: VTX_READ_8 T1.X, T0.X, 4, #1 1285; EG-NEXT: 
VTX_READ_8 T0.X, T0.X, 0, #1 1286; EG-NEXT: ALU clause starting at 10: 1287; EG-NEXT: MOV * T0.X, KC0[2].Z, 1288; EG-NEXT: ALU clause starting at 11: 1289; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 1290; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 1291; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1292; EG-NEXT: AND_INT T1.W, PS, 1, 1293; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 1294; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1295; EG-NEXT: LSHL T0.X, PV.W, PS, 1296; EG-NEXT: LSHL * T0.W, literal.x, PS, 1297; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1298; EG-NEXT: MOV T0.Y, 0.0, 1299; EG-NEXT: MOV * T0.Z, 0.0, 1300; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1301; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1302entry: 1303 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 1304 %a = load i1, ptr addrspace(1) %in 1305 %b = load i1, ptr addrspace(1) %b_ptr 1306 %result = mul i1 %a, %b 1307 store i1 %result, ptr addrspace(1) %out 1308 ret void 1309} 1310 1311; A standard 64-bit multiply. The expansion should be around 6 instructions. 1312; Matching the expansion by hand would need a really complicated list of 1313; FileCheck expressions, so the assertions below are autogenerated by 1314; utils/update_llc_test_checks.py. If a correct optimization changes the 1315; expansion, regenerate the checks rather than editing them by hand; the test 1316; also guards against a 'failed to select' crash.
1317 1318define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { 1319; SI-LABEL: s_mul_i64: 1320; SI: ; %bb.0: ; %entry 1321; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1322; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1323; SI-NEXT: s_mov_b32 s3, 0xf000 1324; SI-NEXT: s_mov_b32 s2, -1 1325; SI-NEXT: s_waitcnt lgkmcnt(0) 1326; SI-NEXT: s_mov_b32 s0, s4 1327; SI-NEXT: v_mov_b32_e32 v0, s8 1328; SI-NEXT: v_mul_hi_u32 v0, s6, v0 1329; SI-NEXT: s_mul_i32 s4, s6, s9 1330; SI-NEXT: s_mov_b32 s1, s5 1331; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 1332; SI-NEXT: s_mul_i32 s4, s7, s8 1333; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v0 1334; SI-NEXT: s_mul_i32 s4, s6, s8 1335; SI-NEXT: v_mov_b32_e32 v0, s4 1336; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1337; SI-NEXT: s_endpgm 1338; 1339; VI-LABEL: s_mul_i64: 1340; VI: ; %bb.0: ; %entry 1341; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1342; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1343; VI-NEXT: s_mov_b32 s3, 0xf000 1344; VI-NEXT: s_mov_b32 s2, -1 1345; VI-NEXT: s_waitcnt lgkmcnt(0) 1346; VI-NEXT: s_mov_b32 s0, s4 1347; VI-NEXT: v_mov_b32_e32 v0, s8 1348; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s6, v0, 0 1349; VI-NEXT: s_mul_i32 s4, s6, s9 1350; VI-NEXT: s_mov_b32 s1, s5 1351; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1352; VI-NEXT: s_mul_i32 s4, s7, s8 1353; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1354; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1355; VI-NEXT: s_endpgm 1356; 1357; GFX9-LABEL: s_mul_i64: 1358; GFX9: ; %bb.0: ; %entry 1359; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1360; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1361; GFX9-NEXT: s_mov_b32 s3, 0xf000 1362; GFX9-NEXT: s_mov_b32 s2, -1 1363; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1364; GFX9-NEXT: s_mov_b32 s0, s4 1365; GFX9-NEXT: s_mov_b32 s1, s5 1366; GFX9-NEXT: s_mul_i32 s4, s6, s9 1367; GFX9-NEXT: s_mul_hi_u32 s5, s6, s8 1368; GFX9-NEXT: s_add_i32 s4, s5, s4 1369; GFX9-NEXT: s_mul_i32 s5, s7, s8 1370; 
GFX9-NEXT: s_add_i32 s4, s4, s5 1371; GFX9-NEXT: s_mul_i32 s5, s6, s8 1372; GFX9-NEXT: v_mov_b32_e32 v0, s5 1373; GFX9-NEXT: v_mov_b32_e32 v1, s4 1374; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1375; GFX9-NEXT: s_endpgm 1376; 1377; GFX10-LABEL: s_mul_i64: 1378; GFX10: ; %bb.0: ; %entry 1379; GFX10-NEXT: s_clause 0x1 1380; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1381; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1382; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1383; GFX10-NEXT: s_mul_i32 s0, s6, s3 1384; GFX10-NEXT: s_mul_hi_u32 s1, s6, s2 1385; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1386; GFX10-NEXT: s_add_i32 s0, s1, s0 1387; GFX10-NEXT: s_mul_i32 s1, s7, s2 1388; GFX10-NEXT: s_mul_i32 s2, s6, s2 1389; GFX10-NEXT: s_add_i32 s0, s0, s1 1390; GFX10-NEXT: v_mov_b32_e32 v0, s2 1391; GFX10-NEXT: v_mov_b32_e32 v1, s0 1392; GFX10-NEXT: s_mov_b32 s2, -1 1393; GFX10-NEXT: s_mov_b32 s0, s4 1394; GFX10-NEXT: s_mov_b32 s1, s5 1395; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1396; GFX10-NEXT: s_endpgm 1397; 1398; GFX11-LABEL: s_mul_i64: 1399; GFX11: ; %bb.0: ; %entry 1400; GFX11-NEXT: s_clause 0x1 1401; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1402; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1403; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1404; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1405; GFX11-NEXT: s_mul_i32 s1, s6, s1 1406; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0 1407; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 1408; GFX11-NEXT: s_add_i32 s1, s2, s1 1409; GFX11-NEXT: s_mul_i32 s2, s7, s0 1410; GFX11-NEXT: s_mul_i32 s0, s6, s0 1411; GFX11-NEXT: s_add_i32 s1, s1, s2 1412; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 1413; GFX11-NEXT: s_mov_b32 s2, -1 1414; GFX11-NEXT: s_mov_b32 s0, s4 1415; GFX11-NEXT: s_mov_b32 s1, s5 1416; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 1417; GFX11-NEXT: s_nop 0 1418; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1419; GFX11-NEXT: s_endpgm 1420; 1421; 
EG-LABEL: s_mul_i64: 1422; EG: ; %bb.0: ; %entry 1423; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1424; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1425; EG-NEXT: CF_END 1426; EG-NEXT: PAD 1427; EG-NEXT: ALU clause starting at 4: 1428; EG-NEXT: MULHI * T0.X, KC0[2].W, KC0[3].Y, 1429; EG-NEXT: MULLO_INT * T0.Y, KC0[2].W, KC0[3].Z, 1430; EG-NEXT: ADD_INT T0.W, T0.X, PS, 1431; EG-NEXT: MULLO_INT * T0.X, KC0[3].X, KC0[3].Y, 1432; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 1433; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1434; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1435; EG-NEXT: MULLO_INT * T0.X, KC0[2].W, KC0[3].Y, 1436entry: 1437 %mul = mul i64 %a, %b 1438 store i64 %mul, ptr addrspace(1) %out, align 8 1439 ret void 1440} 1441 1442define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { 1443; SI-LABEL: v_mul_i64: 1444; SI: ; %bb.0: ; %entry 1445; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 1446; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd 1447; SI-NEXT: s_mov_b32 s3, 0xf000 1448; SI-NEXT: s_mov_b32 s2, -1 1449; SI-NEXT: s_mov_b32 s10, s2 1450; SI-NEXT: s_mov_b32 s11, s3 1451; SI-NEXT: s_waitcnt lgkmcnt(0) 1452; SI-NEXT: s_mov_b32 s12, s6 1453; SI-NEXT: s_mov_b32 s13, s7 1454; SI-NEXT: s_mov_b32 s14, s2 1455; SI-NEXT: s_mov_b32 s15, s3 1456; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1457; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1458; SI-NEXT: s_mov_b32 s0, s4 1459; SI-NEXT: s_mov_b32 s1, s5 1460; SI-NEXT: s_waitcnt vmcnt(0) 1461; SI-NEXT: v_mul_lo_u32 v1, v2, v1 1462; SI-NEXT: v_mul_hi_u32 v4, v2, v0 1463; SI-NEXT: v_mul_lo_u32 v3, v3, v0 1464; SI-NEXT: v_mul_lo_u32 v0, v2, v0 1465; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 1466; SI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1467; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1468; SI-NEXT: s_endpgm 1469; 1470; VI-LABEL: v_mul_i64: 1471; VI: ; %bb.0: ; %entry 1472; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1473; VI-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0x34 1474; VI-NEXT: s_mov_b32 s3, 0xf000 1475; VI-NEXT: s_mov_b32 s2, -1 1476; VI-NEXT: s_mov_b32 s10, s2 1477; VI-NEXT: s_mov_b32 s11, s3 1478; VI-NEXT: s_waitcnt lgkmcnt(0) 1479; VI-NEXT: s_mov_b32 s12, s6 1480; VI-NEXT: s_mov_b32 s13, s7 1481; VI-NEXT: s_mov_b32 s14, s2 1482; VI-NEXT: s_mov_b32 s15, s3 1483; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1484; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1485; VI-NEXT: s_mov_b32 s0, s4 1486; VI-NEXT: s_mov_b32 s1, s5 1487; VI-NEXT: s_waitcnt vmcnt(0) 1488; VI-NEXT: v_mul_lo_u32 v4, v2, v1 1489; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v2, v0, 0 1490; VI-NEXT: v_mul_lo_u32 v0, v3, v0 1491; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 1492; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 1493; VI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 1494; VI-NEXT: s_endpgm 1495; 1496; GFX9-LABEL: v_mul_i64: 1497; GFX9: ; %bb.0: ; %entry 1498; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1499; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1500; GFX9-NEXT: s_mov_b32 s3, 0xf000 1501; GFX9-NEXT: s_mov_b32 s2, -1 1502; GFX9-NEXT: s_mov_b32 s10, s2 1503; GFX9-NEXT: s_mov_b32 s11, s3 1504; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1505; GFX9-NEXT: s_mov_b32 s12, s6 1506; GFX9-NEXT: s_mov_b32 s13, s7 1507; GFX9-NEXT: s_mov_b32 s14, s2 1508; GFX9-NEXT: s_mov_b32 s15, s3 1509; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1510; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1511; GFX9-NEXT: s_mov_b32 s0, s4 1512; GFX9-NEXT: s_mov_b32 s1, s5 1513; GFX9-NEXT: s_waitcnt vmcnt(0) 1514; GFX9-NEXT: v_mul_lo_u32 v1, v2, v1 1515; GFX9-NEXT: v_mul_hi_u32 v4, v2, v0 1516; GFX9-NEXT: v_mul_lo_u32 v3, v3, v0 1517; GFX9-NEXT: v_mul_lo_u32 v0, v2, v0 1518; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 1519; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1520; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1521; GFX9-NEXT: s_endpgm 1522; 1523; GFX10-LABEL: v_mul_i64: 1524; GFX10: ; %bb.0: ; %entry 1525; GFX10-NEXT: s_clause 0x1 
1526; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 1527; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 1528; GFX10-NEXT: s_mov_b32 s2, -1 1529; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1530; GFX10-NEXT: s_mov_b32 s10, s2 1531; GFX10-NEXT: s_mov_b32 s11, s3 1532; GFX10-NEXT: s_mov_b32 s14, s2 1533; GFX10-NEXT: s_mov_b32 s15, s3 1534; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1535; GFX10-NEXT: s_mov_b32 s12, s6 1536; GFX10-NEXT: s_mov_b32 s13, s7 1537; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 1538; GFX10-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 1539; GFX10-NEXT: s_mov_b32 s0, s4 1540; GFX10-NEXT: s_mov_b32 s1, s5 1541; GFX10-NEXT: s_waitcnt vmcnt(0) 1542; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1 1543; GFX10-NEXT: v_mul_hi_u32 v4, v2, v0 1544; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 1545; GFX10-NEXT: v_mul_lo_u32 v0, v2, v0 1546; GFX10-NEXT: v_add_nc_u32_e32 v1, v4, v1 1547; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 1548; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1549; GFX10-NEXT: s_endpgm 1550; 1551; GFX11-LABEL: v_mul_i64: 1552; GFX11: ; %bb.0: ; %entry 1553; GFX11-NEXT: s_clause 0x1 1554; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 1555; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 1556; GFX11-NEXT: s_mov_b32 s10, -1 1557; GFX11-NEXT: s_mov_b32 s11, 0x31016000 1558; GFX11-NEXT: s_mov_b32 s2, s10 1559; GFX11-NEXT: s_mov_b32 s3, s11 1560; GFX11-NEXT: s_mov_b32 s14, s10 1561; GFX11-NEXT: s_mov_b32 s15, s11 1562; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1563; GFX11-NEXT: s_mov_b32 s12, s6 1564; GFX11-NEXT: s_mov_b32 s13, s7 1565; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 1566; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0 1567; GFX11-NEXT: s_mov_b32 s8, s4 1568; GFX11-NEXT: s_mov_b32 s9, s5 1569; GFX11-NEXT: s_waitcnt vmcnt(0) 1570; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1 1571; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0 1572; GFX11-NEXT: v_mul_lo_u32 v3, v3, v0 1573; GFX11-NEXT: v_mul_lo_u32 v0, v2, v0 1574; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_1) 1575; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1 1576; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 1577; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0 1578; GFX11-NEXT: s_nop 0 1579; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 1580; GFX11-NEXT: s_endpgm 1581; 1582; EG-LABEL: v_mul_i64: 1583; EG: ; %bb.0: ; %entry 1584; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 1585; EG-NEXT: TEX 1 @6 1586; EG-NEXT: ALU 7, @12, KC0[CB0:0-32], KC1[] 1587; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T2.X, 1 1588; EG-NEXT: CF_END 1589; EG-NEXT: PAD 1590; EG-NEXT: Fetch clause starting at 6: 1591; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 1592; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 1593; EG-NEXT: ALU clause starting at 10: 1594; EG-NEXT: MOV T0.X, KC0[2].Z, 1595; EG-NEXT: MOV * T1.X, KC0[2].W, 1596; EG-NEXT: ALU clause starting at 12: 1597; EG-NEXT: MULHI * T0.Z, T0.X, T1.X, 1598; EG-NEXT: MULLO_INT * T0.W, T0.X, T1.Y, 1599; EG-NEXT: ADD_INT T0.W, T0.Z, PS, 1600; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T1.X, 1601; EG-NEXT: ADD_INT * T0.Y, PV.W, PS, 1602; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, 1603; EG-NEXT: MULLO_INT * T0.X, T0.X, T1.X, 1604; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1605entry: 1606 %a = load i64, ptr addrspace(1) %aptr, align 8 1607 %b = load i64, ptr addrspace(1) %bptr, align 8 1608 %mul = mul i64 %a, %b 1609 store i64 %mul, ptr addrspace(1) %out, align 8 1610 ret void 1611} 1612 1613define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b, i32 %c) { 1614; SI-LABEL: mul32_in_branch: 1615; SI: ; %bb.0: ; %entry 1616; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 1617; SI-NEXT: s_waitcnt lgkmcnt(0) 1618; SI-NEXT: s_cmp_lg_u32 s2, 0 1619; SI-NEXT: s_cbranch_scc0 .LBB13_2 1620; SI-NEXT: ; %bb.1: ; %else 1621; SI-NEXT: s_mul_i32 s6, s2, s3 1622; SI-NEXT: s_mov_b64 s[4:5], 0 1623; SI-NEXT: s_branch .LBB13_3 1624; SI-NEXT: .LBB13_2: 1625; SI-NEXT: s_mov_b64 s[4:5], -1 1626; SI-NEXT: ; 
implicit-def: $sgpr6 1627; SI-NEXT: .LBB13_3: ; %Flow 1628; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 1629; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1630; SI-NEXT: s_waitcnt lgkmcnt(0) 1631; SI-NEXT: s_mov_b64 vcc, vcc 1632; SI-NEXT: s_cbranch_vccnz .LBB13_5 1633; SI-NEXT: ; %bb.4: ; %if 1634; SI-NEXT: s_mov_b32 s7, 0xf000 1635; SI-NEXT: s_mov_b32 s6, -1 1636; SI-NEXT: s_mov_b32 s4, s2 1637; SI-NEXT: s_mov_b32 s5, s3 1638; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1639; SI-NEXT: s_branch .LBB13_6 1640; SI-NEXT: .LBB13_5: 1641; SI-NEXT: v_mov_b32_e32 v0, s6 1642; SI-NEXT: .LBB13_6: ; %endif 1643; SI-NEXT: s_mov_b32 s3, 0xf000 1644; SI-NEXT: s_mov_b32 s2, -1 1645; SI-NEXT: s_waitcnt vmcnt(0) 1646; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1647; SI-NEXT: s_endpgm 1648; 1649; VI-LABEL: mul32_in_branch: 1650; VI: ; %bb.0: ; %entry 1651; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1652; VI-NEXT: s_waitcnt lgkmcnt(0) 1653; VI-NEXT: s_cmp_lg_u32 s2, 0 1654; VI-NEXT: s_cbranch_scc0 .LBB13_2 1655; VI-NEXT: ; %bb.1: ; %else 1656; VI-NEXT: s_mul_i32 s6, s2, s3 1657; VI-NEXT: s_mov_b64 s[4:5], 0 1658; VI-NEXT: s_branch .LBB13_3 1659; VI-NEXT: .LBB13_2: 1660; VI-NEXT: s_mov_b64 s[4:5], -1 1661; VI-NEXT: ; implicit-def: $sgpr6 1662; VI-NEXT: .LBB13_3: ; %Flow 1663; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1664; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1665; VI-NEXT: s_cbranch_vccnz .LBB13_5 1666; VI-NEXT: ; %bb.4: ; %if 1667; VI-NEXT: s_mov_b32 s7, 0xf000 1668; VI-NEXT: s_mov_b32 s6, -1 1669; VI-NEXT: s_waitcnt lgkmcnt(0) 1670; VI-NEXT: s_mov_b32 s4, s2 1671; VI-NEXT: s_mov_b32 s5, s3 1672; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 1673; VI-NEXT: s_branch .LBB13_6 1674; VI-NEXT: .LBB13_5: 1675; VI-NEXT: v_mov_b32_e32 v0, s6 1676; VI-NEXT: .LBB13_6: ; %endif 1677; VI-NEXT: s_waitcnt lgkmcnt(0) 1678; VI-NEXT: s_mov_b32 s3, 0xf000 1679; VI-NEXT: s_mov_b32 s2, -1 1680; VI-NEXT: s_waitcnt vmcnt(0) 1681; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1682; VI-NEXT: 
s_endpgm 1683; 1684; GFX9-LABEL: mul32_in_branch: 1685; GFX9: ; %bb.0: ; %entry 1686; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1687; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1688; GFX9-NEXT: s_cmp_lg_u32 s2, 0 1689; GFX9-NEXT: s_cbranch_scc0 .LBB13_2 1690; GFX9-NEXT: ; %bb.1: ; %else 1691; GFX9-NEXT: s_mul_i32 s6, s2, s3 1692; GFX9-NEXT: s_mov_b64 s[4:5], 0 1693; GFX9-NEXT: s_branch .LBB13_3 1694; GFX9-NEXT: .LBB13_2: 1695; GFX9-NEXT: s_mov_b64 s[4:5], -1 1696; GFX9-NEXT: ; implicit-def: $sgpr6 1697; GFX9-NEXT: .LBB13_3: ; %Flow 1698; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1699; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1700; GFX9-NEXT: s_cbranch_vccnz .LBB13_5 1701; GFX9-NEXT: ; %bb.4: ; %if 1702; GFX9-NEXT: s_mov_b32 s7, 0xf000 1703; GFX9-NEXT: s_mov_b32 s6, -1 1704; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1705; GFX9-NEXT: s_mov_b32 s4, s2 1706; GFX9-NEXT: s_mov_b32 s5, s3 1707; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0 1708; GFX9-NEXT: s_branch .LBB13_6 1709; GFX9-NEXT: .LBB13_5: 1710; GFX9-NEXT: v_mov_b32_e32 v0, s6 1711; GFX9-NEXT: .LBB13_6: ; %endif 1712; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1713; GFX9-NEXT: s_mov_b32 s3, 0xf000 1714; GFX9-NEXT: s_mov_b32 s2, -1 1715; GFX9-NEXT: s_waitcnt vmcnt(0) 1716; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 1717; GFX9-NEXT: s_endpgm 1718; 1719; GFX10-LABEL: mul32_in_branch: 1720; GFX10: ; %bb.0: ; %entry 1721; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 1722; GFX10-NEXT: s_mov_b32 s4, 0 1723; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1724; GFX10-NEXT: s_cmp_lg_u32 s2, 0 1725; GFX10-NEXT: s_cbranch_scc0 .LBB13_2 1726; GFX10-NEXT: ; %bb.1: ; %else 1727; GFX10-NEXT: s_mul_i32 s5, s2, s3 1728; GFX10-NEXT: s_branch .LBB13_3 1729; GFX10-NEXT: .LBB13_2: 1730; GFX10-NEXT: s_mov_b32 s4, -1 1731; GFX10-NEXT: ; implicit-def: $sgpr5 1732; GFX10-NEXT: .LBB13_3: ; %Flow 1733; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 1734; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 1735; GFX10-NEXT: s_cbranch_vccnz .LBB13_5 1736; GFX10-NEXT: ; 
%bb.4: ; %if 1737; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1738; GFX10-NEXT: s_mov_b32 s6, -1 1739; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1740; GFX10-NEXT: s_mov_b32 s4, s2 1741; GFX10-NEXT: s_mov_b32 s5, s3 1742; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0 1743; GFX10-NEXT: s_branch .LBB13_6 1744; GFX10-NEXT: .LBB13_5: 1745; GFX10-NEXT: v_mov_b32_e32 v0, s5 1746; GFX10-NEXT: .LBB13_6: ; %endif 1747; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1748; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1749; GFX10-NEXT: s_mov_b32 s2, -1 1750; GFX10-NEXT: s_waitcnt vmcnt(0) 1751; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 1752; GFX10-NEXT: s_endpgm 1753; 1754; GFX11-LABEL: mul32_in_branch: 1755; GFX11: ; %bb.0: ; %entry 1756; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 1757; GFX11-NEXT: s_mov_b32 s4, 0 1758; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1759; GFX11-NEXT: s_cmp_lg_u32 s2, 0 1760; GFX11-NEXT: s_cbranch_scc0 .LBB13_2 1761; GFX11-NEXT: ; %bb.1: ; %else 1762; GFX11-NEXT: s_mul_i32 s5, s2, s3 1763; GFX11-NEXT: s_branch .LBB13_3 1764; GFX11-NEXT: .LBB13_2: 1765; GFX11-NEXT: s_mov_b32 s4, -1 1766; GFX11-NEXT: ; implicit-def: $sgpr5 1767; GFX11-NEXT: .LBB13_3: ; %Flow 1768; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 1769; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 1770; GFX11-NEXT: s_cbranch_vccnz .LBB13_5 1771; GFX11-NEXT: ; %bb.4: ; %if 1772; GFX11-NEXT: s_mov_b32 s7, 0x31016000 1773; GFX11-NEXT: s_mov_b32 s6, -1 1774; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1775; GFX11-NEXT: s_mov_b32 s4, s2 1776; GFX11-NEXT: s_mov_b32 s5, s3 1777; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 1778; GFX11-NEXT: s_branch .LBB13_6 1779; GFX11-NEXT: .LBB13_5: 1780; GFX11-NEXT: v_mov_b32_e32 v0, s5 1781; GFX11-NEXT: .LBB13_6: ; %endif 1782; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1783; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1784; GFX11-NEXT: s_mov_b32 s2, -1 1785; GFX11-NEXT: s_waitcnt vmcnt(0) 1786; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 1787; GFX11-NEXT: s_nop 0 1788; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) 1789; GFX11-NEXT: s_endpgm 1790; 1791; EG-LABEL: mul32_in_branch: 1792; EG: ; %bb.0: ; %entry 1793; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] 1794; EG-NEXT: JUMP @3 POP:1 1795; EG-NEXT: ALU_POP_AFTER 4, @18, KC0[CB0:0-32], KC1[] 1796; EG-NEXT: ALU_PUSH_BEFORE 2, @23, KC0[CB0:0-32], KC1[] 1797; EG-NEXT: JUMP @8 POP:1 1798; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] 1799; EG-NEXT: TEX 0 @12 1800; EG-NEXT: POP @8 POP:1 1801; EG-NEXT: ALU 1, @27, KC0[], KC1[] 1802; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1803; EG-NEXT: CF_END 1804; EG-NEXT: PAD 1805; EG-NEXT: Fetch clause starting at 12: 1806; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1807; EG-NEXT: ALU clause starting at 14: 1808; EG-NEXT: MOV T0.W, literal.x, 1809; EG-NEXT: SETNE_INT * T1.W, KC0[2].W, 0.0, 1810; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1811; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1812; EG-NEXT: ALU clause starting at 18: 1813; EG-NEXT: MOV T1.W, KC0[2].W, 1814; EG-NEXT: MOV * T2.W, KC0[3].X, 1815; EG-NEXT: MOV T0.W, literal.x, 1816; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 1817; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1818; EG-NEXT: ALU clause starting at 23: 1819; EG-NEXT: MOV T1.W, KC0[2].Y, 1820; EG-NEXT: SETE_INT * T0.W, T0.W, 0.0, 1821; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1822; EG-NEXT: ALU clause starting at 26: 1823; EG-NEXT: MOV * T0.X, KC0[2].Z, 1824; EG-NEXT: ALU clause starting at 27: 1825; EG-NEXT: LSHR * T1.X, T1.W, literal.x, 1826; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1827entry: 1828 %0 = icmp eq i32 %a, 0 1829 br i1 %0, label %if, label %else 1830 1831if: 1832 %1 = load i32, ptr addrspace(1) %in 1833 br label %endif 1834 1835else: 1836 %2 = mul i32 %a, %b 1837 br label %endif 1838 1839endif: 1840 %3 = phi i32 [%1, %if], [%2, %else] 1841 store i32 %3, ptr addrspace(1) %out 1842 ret void 1843} 1844 1845define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr 
addrspace(1) %in, i64 %a, i64 %b, i64 %c) { 1846; SI-LABEL: mul64_in_branch: 1847; SI: ; %bb.0: ; %entry 1848; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 1849; SI-NEXT: s_mov_b64 s[8:9], 0 1850; SI-NEXT: s_waitcnt lgkmcnt(0) 1851; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 1852; SI-NEXT: s_and_b64 vcc, exec, s[10:11] 1853; SI-NEXT: s_cbranch_vccz .LBB14_4 1854; SI-NEXT: ; %bb.1: ; %else 1855; SI-NEXT: v_mov_b32_e32 v0, s6 1856; SI-NEXT: v_mul_hi_u32 v0, s4, v0 1857; SI-NEXT: s_mul_i32 s7, s4, s7 1858; SI-NEXT: s_mul_i32 s5, s5, s6 1859; SI-NEXT: s_mul_i32 s4, s4, s6 1860; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 1861; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 1862; SI-NEXT: v_mov_b32_e32 v0, s4 1863; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 1864; SI-NEXT: s_cbranch_vccnz .LBB14_3 1865; SI-NEXT: .LBB14_2: ; %if 1866; SI-NEXT: s_mov_b32 s7, 0xf000 1867; SI-NEXT: s_mov_b32 s6, -1 1868; SI-NEXT: s_mov_b32 s4, s2 1869; SI-NEXT: s_mov_b32 s5, s3 1870; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1871; SI-NEXT: .LBB14_3: ; %endif 1872; SI-NEXT: s_mov_b32 s3, 0xf000 1873; SI-NEXT: s_mov_b32 s2, -1 1874; SI-NEXT: s_waitcnt vmcnt(0) 1875; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1876; SI-NEXT: s_endpgm 1877; SI-NEXT: .LBB14_4: 1878; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 1879; SI-NEXT: s_branch .LBB14_2 1880; 1881; VI-LABEL: mul64_in_branch: 1882; VI: ; %bb.0: ; %entry 1883; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 1884; VI-NEXT: s_mov_b64 s[8:9], 0 1885; VI-NEXT: s_waitcnt lgkmcnt(0) 1886; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 1887; VI-NEXT: s_cbranch_scc0 .LBB14_4 1888; VI-NEXT: ; %bb.1: ; %else 1889; VI-NEXT: v_mov_b32_e32 v0, s6 1890; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 1891; VI-NEXT: s_mul_i32 s4, s4, s7 1892; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1893; VI-NEXT: s_mul_i32 s4, s5, s6 1894; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 1895; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] 1896; VI-NEXT: s_cbranch_vccnz .LBB14_3 1897; VI-NEXT: .LBB14_2: ; %if 
1898; VI-NEXT: s_mov_b32 s7, 0xf000 1899; VI-NEXT: s_mov_b32 s6, -1 1900; VI-NEXT: s_mov_b32 s4, s2 1901; VI-NEXT: s_mov_b32 s5, s3 1902; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1903; VI-NEXT: .LBB14_3: ; %endif 1904; VI-NEXT: s_mov_b32 s3, 0xf000 1905; VI-NEXT: s_mov_b32 s2, -1 1906; VI-NEXT: s_waitcnt vmcnt(0) 1907; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1908; VI-NEXT: s_endpgm 1909; VI-NEXT: .LBB14_4: 1910; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 1911; VI-NEXT: s_branch .LBB14_2 1912; 1913; GFX9-LABEL: mul64_in_branch: 1914; GFX9: ; %bb.0: ; %entry 1915; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 1916; GFX9-NEXT: s_mov_b64 s[8:9], 0 1917; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1918; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 1919; GFX9-NEXT: s_cbranch_scc0 .LBB14_3 1920; GFX9-NEXT: ; %bb.1: ; %else 1921; GFX9-NEXT: s_mul_i32 s7, s4, s7 1922; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 1923; GFX9-NEXT: s_add_i32 s7, s10, s7 1924; GFX9-NEXT: s_mul_i32 s5, s5, s6 1925; GFX9-NEXT: s_add_i32 s5, s7, s5 1926; GFX9-NEXT: s_mul_i32 s4, s4, s6 1927; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] 1928; GFX9-NEXT: s_cbranch_vccnz .LBB14_4 1929; GFX9-NEXT: .LBB14_2: ; %if 1930; GFX9-NEXT: s_mov_b32 s7, 0xf000 1931; GFX9-NEXT: s_mov_b32 s6, -1 1932; GFX9-NEXT: s_mov_b32 s4, s2 1933; GFX9-NEXT: s_mov_b32 s5, s3 1934; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1935; GFX9-NEXT: s_branch .LBB14_5 1936; GFX9-NEXT: .LBB14_3: 1937; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 1938; GFX9-NEXT: s_branch .LBB14_2 1939; GFX9-NEXT: .LBB14_4: 1940; GFX9-NEXT: v_mov_b32_e32 v0, s4 1941; GFX9-NEXT: v_mov_b32_e32 v1, s5 1942; GFX9-NEXT: .LBB14_5: ; %endif 1943; GFX9-NEXT: s_mov_b32 s3, 0xf000 1944; GFX9-NEXT: s_mov_b32 s2, -1 1945; GFX9-NEXT: s_waitcnt vmcnt(0) 1946; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1947; GFX9-NEXT: s_endpgm 1948; 1949; GFX10-LABEL: mul64_in_branch: 1950; GFX10: ; %bb.0: ; %entry 1951; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 1952; 
GFX10-NEXT: s_waitcnt lgkmcnt(0) 1953; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 1954; GFX10-NEXT: s_cbranch_scc0 .LBB14_3 1955; GFX10-NEXT: ; %bb.1: ; %else 1956; GFX10-NEXT: s_mul_i32 s7, s4, s7 1957; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 1958; GFX10-NEXT: s_mul_i32 s5, s5, s6 1959; GFX10-NEXT: s_add_i32 s7, s8, s7 1960; GFX10-NEXT: s_mul_i32 s4, s4, s6 1961; GFX10-NEXT: s_add_i32 s5, s7, s5 1962; GFX10-NEXT: s_mov_b32 s6, 0 1963; GFX10-NEXT: s_cbranch_execnz .LBB14_4 1964; GFX10-NEXT: .LBB14_2: ; %if 1965; GFX10-NEXT: s_mov_b32 s7, 0x31016000 1966; GFX10-NEXT: s_mov_b32 s6, -1 1967; GFX10-NEXT: s_mov_b32 s4, s2 1968; GFX10-NEXT: s_mov_b32 s5, s3 1969; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 1970; GFX10-NEXT: s_branch .LBB14_5 1971; GFX10-NEXT: .LBB14_3: 1972; GFX10-NEXT: s_mov_b32 s6, -1 1973; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 1974; GFX10-NEXT: s_branch .LBB14_2 1975; GFX10-NEXT: .LBB14_4: 1976; GFX10-NEXT: v_mov_b32_e32 v0, s4 1977; GFX10-NEXT: v_mov_b32_e32 v1, s5 1978; GFX10-NEXT: .LBB14_5: ; %endif 1979; GFX10-NEXT: s_mov_b32 s3, 0x31016000 1980; GFX10-NEXT: s_mov_b32 s2, -1 1981; GFX10-NEXT: s_waitcnt vmcnt(0) 1982; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1983; GFX10-NEXT: s_endpgm 1984; 1985; GFX11-LABEL: mul64_in_branch: 1986; GFX11: ; %bb.0: ; %entry 1987; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 1988; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1989; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 1990; GFX11-NEXT: s_cbranch_scc0 .LBB14_3 1991; GFX11-NEXT: ; %bb.1: ; %else 1992; GFX11-NEXT: s_mul_i32 s7, s4, s7 1993; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 1994; GFX11-NEXT: s_mul_i32 s5, s5, s6 1995; GFX11-NEXT: s_add_i32 s7, s8, s7 1996; GFX11-NEXT: s_mul_i32 s4, s4, s6 1997; GFX11-NEXT: s_add_i32 s5, s7, s5 1998; GFX11-NEXT: s_mov_b32 s6, 0 1999; GFX11-NEXT: s_cbranch_execnz .LBB14_4 2000; GFX11-NEXT: .LBB14_2: ; %if 2001; GFX11-NEXT: s_mov_b32 s7, 0x31016000 2002; GFX11-NEXT: s_mov_b32 s6, -1 2003; GFX11-NEXT: s_mov_b32 s4, s2 2004; GFX11-NEXT: 
s_mov_b32 s5, s3 2005; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 2006; GFX11-NEXT: s_branch .LBB14_5 2007; GFX11-NEXT: .LBB14_3: 2008; GFX11-NEXT: s_mov_b32 s6, -1 2009; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 2010; GFX11-NEXT: s_branch .LBB14_2 2011; GFX11-NEXT: .LBB14_4: 2012; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 2013; GFX11-NEXT: .LBB14_5: ; %endif 2014; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2015; GFX11-NEXT: s_mov_b32 s2, -1 2016; GFX11-NEXT: s_waitcnt vmcnt(0) 2017; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 2018; GFX11-NEXT: s_nop 0 2019; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2020; GFX11-NEXT: s_endpgm 2021; 2022; EG-LABEL: mul64_in_branch: 2023; EG: ; %bb.0: ; %entry 2024; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] 2025; EG-NEXT: JUMP @3 POP:1 2026; EG-NEXT: ALU_POP_AFTER 11, @19, KC0[CB0:0-32], KC1[] 2027; EG-NEXT: ALU_PUSH_BEFORE 2, @31, KC0[CB0:0-32], KC1[] 2028; EG-NEXT: JUMP @8 POP:1 2029; EG-NEXT: ALU 0, @34, KC0[CB0:0-32], KC1[] 2030; EG-NEXT: TEX 0 @12 2031; EG-NEXT: POP @8 POP:1 2032; EG-NEXT: ALU 1, @35, KC0[], KC1[] 2033; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 2034; EG-NEXT: CF_END 2035; EG-NEXT: PAD 2036; EG-NEXT: Fetch clause starting at 12: 2037; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 2038; EG-NEXT: ALU clause starting at 14: 2039; EG-NEXT: OR_INT T0.W, KC0[2].W, KC0[3].X, 2040; EG-NEXT: MOV * T1.W, literal.x, 2041; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 2042; EG-NEXT: SETNE_INT * T0.W, PV.W, 0.0, 2043; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0, 2044; EG-NEXT: ALU clause starting at 19: 2045; EG-NEXT: MOV T0.W, KC0[2].W, 2046; EG-NEXT: MOV * T1.W, KC0[3].Z, 2047; EG-NEXT: MOV T2.W, KC0[3].Y, 2048; EG-NEXT: MULLO_INT * T0.X, PV.W, PS, 2049; EG-NEXT: MOV T1.W, KC0[3].X, 2050; EG-NEXT: MULHI * T0.Y, T0.W, PV.W, 2051; EG-NEXT: ADD_INT T3.W, PS, T0.X, 2052; EG-NEXT: MULLO_INT * T0.X, PV.W, T2.W, 2053; EG-NEXT: ADD_INT T0.Y, PV.W, PS, 2054; 
; Doc note for the physical line below. In order, it carries the last EG
; assertions for @mul64_in_branch, that function's IR body, and the signature
; plus the first SI assertions of @s_mul_i128.
;
; @mul64_in_branch IR body: %a is compared with 0. When it is zero, the `if`
; block loads an i64 from %in; otherwise the `else` block computes
; mul i64 %a, %b. The phi in `endif` merges the two values and the result is
; stored to %out. The body never references the %c parameter.
;
; @s_mul_i128 signature: each i128 operand (%a, %b) is preceded by an unnamed
; [8 x i32] argument (presumably padding that keeps the operands at separate
; kernel-argument offsets -- TODO confirm). The signature spells out
; `nounwind` and also references attribute group #0, which is defined as
; { nounwind } at the end of the file.
;
; The assertion text is autogenerated; regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
EG-NEXT: MOV T1.W, literal.x, 2055; EG-NEXT: MULLO_INT * T0.X, T0.W, T2.W, 2056; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2057; EG-NEXT: ALU clause starting at 31: 2058; EG-NEXT: MOV T0.W, KC0[2].Y, 2059; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 2060; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 2061; EG-NEXT: ALU clause starting at 34: 2062; EG-NEXT: MOV * T0.X, KC0[2].Z, 2063; EG-NEXT: ALU clause starting at 35: 2064; EG-NEXT: LSHR * T1.X, T0.W, literal.x, 2065; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2066entry: 2067 %0 = icmp eq i64 %a, 0 2068 br i1 %0, label %if, label %else 2069 2070if: 2071 %1 = load i64, ptr addrspace(1) %in 2072 br label %endif 2073 2074else: 2075 %2 = mul i64 %a, %b 2076 br label %endif 2077 2078endif: 2079 %3 = phi i64 [%1, %if], [%2, %else] 2080 store i64 %3, ptr addrspace(1) %out 2081 ret void 2082} 2083 2084define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 { 2085; SI-LABEL: s_mul_i128: 2086; SI: ; %bb.0: ; %entry 2087; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x13 2088; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x1f 2089; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2090; SI-NEXT: s_mov_b32 s3, 0xf000 2091; SI-NEXT: s_mov_b32 s2, -1 2092; SI-NEXT: s_waitcnt lgkmcnt(0) 2093; SI-NEXT: v_mov_b32_e32 v0, s6 2094; SI-NEXT: v_mul_hi_u32 v0, s8, v0 2095; SI-NEXT: v_mov_b32_e32 v1, s4 2096; SI-NEXT: v_mul_hi_u32 v1, s10, v1 2097; SI-NEXT: s_mul_i32 s7, s8, s7 2098; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 2099; SI-NEXT: s_mul_i32 s7, s10, s5 2100; SI-NEXT: s_mul_i32 s12, s9, s6 2101; SI-NEXT: s_mul_i32 s6, s8, s6 2102; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 2103; SI-NEXT: s_mul_i32 s7, s11, s4 2104; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 2105; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v1 2106; SI-NEXT: s_mul_i32 s7, s10, s4 2107; SI-NEXT: v_mov_b32_e32 v2, s6 2108; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v2 2109; SI-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc 2110; SI-NEXT: 
v_mov_b32_e32 v1, s8 2111; SI-NEXT: v_mul_hi_u32 v5, s4, v1 2112; SI-NEXT: v_mul_hi_u32 v1, s5, v1 2113; SI-NEXT: v_mov_b32_e32 v3, s9 2114; SI-NEXT: v_mul_hi_u32 v4, s4, v3 2115; SI-NEXT: s_mul_i32 s7, s5, s8 2116; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v5 2117; SI-NEXT: s_mul_i32 s6, s4, s9 2118; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 2119; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v5 2120; SI-NEXT: v_mul_hi_u32 v3, s5, v3 2121; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 2122; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 2123; SI-NEXT: s_mul_i32 s5, s5, s9 2124; SI-NEXT: v_addc_u32_e64 v5, s[6:7], 0, 0, vcc 2125; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4 2126; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 2127; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 2128; SI-NEXT: s_mul_i32 s4, s4, s8 2129; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc 2130; SI-NEXT: v_mov_b32_e32 v0, s4 2131; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2132; SI-NEXT: s_endpgm 2133; 2134; VI-LABEL: s_mul_i128: 2135; VI: ; %bb.0: ; %entry 2136; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c 2137; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c 2138; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2139; VI-NEXT: v_mov_b32_e32 v5, 0 2140; VI-NEXT: s_mov_b32 s3, 0xf000 2141; VI-NEXT: s_waitcnt lgkmcnt(0) 2142; VI-NEXT: v_mov_b32_e32 v0, s6 2143; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0 2144; VI-NEXT: s_mul_i32 s7, s8, s7 2145; VI-NEXT: v_mov_b32_e32 v6, s8 2146; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3 2147; VI-NEXT: s_mul_i32 s12, s9, s6 2148; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0 2149; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3 2150; VI-NEXT: v_mov_b32_e32 v4, v1 2151; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5] 2152; VI-NEXT: v_mov_b32_e32 v8, s4 2153; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3] 2154; VI-NEXT: v_mov_b32_e32 v3, v7 2155; VI-NEXT: v_mov_b32_e32 v7, v5 2156; VI-NEXT: v_mov_b32_e32 v8, s9 2157; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7] 2158; 
VI-NEXT: s_mul_i32 s8, s11, s4 2159; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2 2160; VI-NEXT: v_mov_b32_e32 v2, v5 2161; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2 2162; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc 2163; VI-NEXT: s_mul_i32 s8, s10, s5 2164; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3] 2165; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6 2166; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 2167; VI-NEXT: s_mov_b32 s2, -1 2168; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 2169; VI-NEXT: v_mov_b32_e32 v1, v4 2170; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2171; VI-NEXT: s_endpgm 2172; 2173; GFX9-LABEL: s_mul_i128: 2174; GFX9: ; %bb.0: ; %entry 2175; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c 2176; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c 2177; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 2178; GFX9-NEXT: s_mov_b32 s7, 0xf000 2179; GFX9-NEXT: s_mov_b32 s6, -1 2180; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2181; GFX9-NEXT: s_mul_i32 s0, s12, s11 2182; GFX9-NEXT: s_mul_hi_u32 s1, s12, s10 2183; GFX9-NEXT: s_mul_i32 s2, s14, s9 2184; GFX9-NEXT: s_mul_hi_u32 s3, s14, s8 2185; GFX9-NEXT: s_add_i32 s0, s1, s0 2186; GFX9-NEXT: s_mul_i32 s1, s13, s10 2187; GFX9-NEXT: s_add_i32 s2, s3, s2 2188; GFX9-NEXT: s_mul_i32 s3, s15, s8 2189; GFX9-NEXT: s_add_i32 s0, s0, s1 2190; GFX9-NEXT: s_mul_i32 s1, s12, s10 2191; GFX9-NEXT: s_add_i32 s2, s2, s3 2192; GFX9-NEXT: s_mul_i32 s3, s14, s8 2193; GFX9-NEXT: s_add_u32 s3, s3, s1 2194; GFX9-NEXT: s_addc_u32 s2, s2, s0 2195; GFX9-NEXT: s_mul_i32 s14, s9, s12 2196; GFX9-NEXT: s_mul_hi_u32 s15, s8, s12 2197; GFX9-NEXT: s_mul_hi_u32 s11, s9, s12 2198; GFX9-NEXT: s_add_u32 s14, s14, s15 2199; GFX9-NEXT: s_mul_i32 s1, s8, s13 2200; GFX9-NEXT: s_addc_u32 s11, s11, 0 2201; GFX9-NEXT: s_mul_hi_u32 s10, s8, s13 2202; GFX9-NEXT: s_add_u32 s1, s1, s14 2203; GFX9-NEXT: s_addc_u32 s10, s10, 0 2204; GFX9-NEXT: s_add_u32 s10, s11, s10 2205; GFX9-NEXT: s_addc_u32 s11, 0, 0 2206; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 2207; GFX9-NEXT: 
s_mul_i32 s9, s9, s13 2208; GFX9-NEXT: s_add_u32 s9, s9, s10 2209; GFX9-NEXT: s_addc_u32 s10, s14, s11 2210; GFX9-NEXT: s_mov_b32 s0, 0 2211; GFX9-NEXT: s_add_u32 s9, s9, s3 2212; GFX9-NEXT: s_addc_u32 s10, s10, s2 2213; GFX9-NEXT: s_mul_i32 s2, s8, s12 2214; GFX9-NEXT: s_mov_b32 s3, s0 2215; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] 2216; GFX9-NEXT: v_mov_b32_e32 v0, s0 2217; GFX9-NEXT: v_mov_b32_e32 v1, s1 2218; GFX9-NEXT: v_mov_b32_e32 v2, s9 2219; GFX9-NEXT: v_mov_b32_e32 v3, s10 2220; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2221; GFX9-NEXT: s_endpgm 2222; 2223; GFX10-LABEL: s_mul_i128: 2224; GFX10: ; %bb.0: ; %entry 2225; GFX10-NEXT: s_clause 0x1 2226; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c 2227; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c 2228; GFX10-NEXT: s_mov_b32 s2, 0 2229; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2230; GFX10-NEXT: s_mov_b32 s13, s2 2231; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2232; GFX10-NEXT: s_mul_i32 s3, s8, s7 2233; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6 2234; GFX10-NEXT: s_mul_i32 s14, s10, s5 2235; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4 2236; GFX10-NEXT: s_mul_i32 s12, s9, s6 2237; GFX10-NEXT: s_mul_i32 s11, s11, s4 2238; GFX10-NEXT: s_add_i32 s3, s7, s3 2239; GFX10-NEXT: s_add_i32 s7, s15, s14 2240; GFX10-NEXT: s_mul_i32 s6, s8, s6 2241; GFX10-NEXT: s_mul_i32 s10, s10, s4 2242; GFX10-NEXT: s_add_i32 s3, s3, s12 2243; GFX10-NEXT: s_add_i32 s7, s7, s11 2244; GFX10-NEXT: s_mul_i32 s19, s5, s8 2245; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8 2246; GFX10-NEXT: s_add_u32 s6, s10, s6 2247; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8 2248; GFX10-NEXT: s_addc_u32 s7, s7, s3 2249; GFX10-NEXT: s_mul_i32 s17, s4, s9 2250; GFX10-NEXT: s_add_u32 s3, s19, s20 2251; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9 2252; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9 2253; GFX10-NEXT: s_mul_i32 s5, s5, s9 2254; GFX10-NEXT: s_addc_u32 s9, s18, 0 2255; GFX10-NEXT: s_add_u32 s3, s17, s3 2256; GFX10-NEXT: s_addc_u32 s10, s16, 0 2257; GFX10-NEXT: s_mul_i32 
s12, s4, s8 2258; GFX10-NEXT: s_add_u32 s4, s9, s10 2259; GFX10-NEXT: s_addc_u32 s8, 0, 0 2260; GFX10-NEXT: s_add_u32 s4, s5, s4 2261; GFX10-NEXT: s_addc_u32 s5, s21, s8 2262; GFX10-NEXT: s_add_u32 s4, s4, s6 2263; GFX10-NEXT: s_addc_u32 s5, s5, s7 2264; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] 2265; GFX10-NEXT: v_mov_b32_e32 v2, s4 2266; GFX10-NEXT: v_mov_b32_e32 v0, s2 2267; GFX10-NEXT: v_mov_b32_e32 v1, s3 2268; GFX10-NEXT: v_mov_b32_e32 v3, s5 2269; GFX10-NEXT: s_mov_b32 s3, 0x31016000 2270; GFX10-NEXT: s_mov_b32 s2, -1 2271; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2272; GFX10-NEXT: s_endpgm 2273; 2274; GFX11-LABEL: s_mul_i128: 2275; GFX11: ; %bb.0: ; %entry 2276; GFX11-NEXT: s_clause 0x2 2277; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c 2278; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c 2279; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 2280; GFX11-NEXT: s_mov_b32 s2, 0 2281; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2282; GFX11-NEXT: s_mov_b32 s13, s2 2283; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2284; GFX11-NEXT: s_mul_i32 s3, s8, s7 2285; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6 2286; GFX11-NEXT: s_mul_i32 s14, s10, s5 2287; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4 2288; GFX11-NEXT: s_mul_i32 s12, s9, s6 2289; GFX11-NEXT: s_mul_i32 s11, s11, s4 2290; GFX11-NEXT: s_add_i32 s3, s7, s3 2291; GFX11-NEXT: s_add_i32 s7, s15, s14 2292; GFX11-NEXT: s_mul_i32 s6, s8, s6 2293; GFX11-NEXT: s_mul_i32 s10, s10, s4 2294; GFX11-NEXT: s_add_i32 s3, s3, s12 2295; GFX11-NEXT: s_add_i32 s7, s7, s11 2296; GFX11-NEXT: s_mul_i32 s19, s5, s8 2297; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8 2298; GFX11-NEXT: s_add_u32 s6, s10, s6 2299; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8 2300; GFX11-NEXT: s_addc_u32 s7, s7, s3 2301; GFX11-NEXT: s_mul_i32 s17, s4, s9 2302; GFX11-NEXT: s_add_u32 s3, s19, s20 2303; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9 2304; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9 2305; GFX11-NEXT: s_mul_i32 s5, s5, s9 2306; GFX11-NEXT: s_addc_u32 s9, s18, 0 2307; GFX11-NEXT: 
s_add_u32 s3, s17, s3 2308; GFX11-NEXT: s_addc_u32 s10, s16, 0 2309; GFX11-NEXT: s_mul_i32 s12, s4, s8 2310; GFX11-NEXT: s_add_u32 s4, s9, s10 2311; GFX11-NEXT: s_addc_u32 s8, 0, 0 2312; GFX11-NEXT: s_add_u32 s4, s5, s4 2313; GFX11-NEXT: s_addc_u32 s5, s21, s8 2314; GFX11-NEXT: s_add_u32 s4, s4, s6 2315; GFX11-NEXT: s_addc_u32 s5, s5, s7 2316; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3] 2317; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2318; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3 2319; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5 2320; GFX11-NEXT: s_mov_b32 s3, 0x31016000 2321; GFX11-NEXT: s_mov_b32 s2, -1 2322; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 2323; GFX11-NEXT: s_nop 0 2324; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2325; GFX11-NEXT: s_endpgm 2326; 2327; EG-LABEL: s_mul_i128: 2328; EG: ; %bb.0: ; %entry 2329; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[] 2330; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2331; EG-NEXT: CF_END 2332; EG-NEXT: PAD 2333; EG-NEXT: ALU clause starting at 4: 2334; EG-NEXT: MULLO_INT * T0.X, KC0[5].X, KC0[8].X, 2335; EG-NEXT: MULHI * T0.Y, KC0[5].X, KC0[8].X, 2336; EG-NEXT: MULLO_INT * T0.Z, KC0[8].Y, KC0[4].W, 2337; EG-NEXT: MULLO_INT * T0.W, KC0[8].X, KC0[5].Y, 2338; EG-NEXT: MULHI * T1.X, KC0[5].X, KC0[7].W, 2339; EG-NEXT: MULHI * T1.Y, KC0[4].W, KC0[8].X, 2340; EG-NEXT: MULHI * T1.Z, KC0[8].Y, KC0[4].W, 2341; EG-NEXT: MULLO_INT * T1.W, KC0[8].Y, KC0[5].X, 2342; EG-NEXT: MULHI * T2.X, KC0[7].W, KC0[5].Y, 2343; EG-NEXT: MULLO_INT * T2.Y, KC0[5].X, KC0[7].W, 2344; EG-NEXT: MULHI * T2.Z, KC0[4].W, KC0[7].W, 2345; EG-NEXT: ADD_INT T2.W, T2.Y, PS, 2346; EG-NEXT: MULLO_INT * T3.X, KC0[4].W, KC0[8].X, 2347; EG-NEXT: ADDC_UINT T2.Z, T2.Y, T2.Z, 2348; EG-NEXT: ADDC_UINT T3.W, PS, PV.W, 2349; EG-NEXT: MULLO_INT * T2.Y, KC0[7].W, KC0[5].Z, 2350; EG-NEXT: ADD_INT T2.X, T2.X, PS, 2351; EG-NEXT: ADD_INT T2.Y, T1.Z, T1.W, 2352; EG-NEXT: ADD_INT T1.Z, T1.Y, PV.W, 2353; EG-NEXT: 
; Doc note for the physical line below. In order, it carries the last EG
; assertions for @s_mul_i128, that function's IR body, and the signature plus
; the first SI assertions of @v_mul_i128.
;
; @s_mul_i128 IR body: a single mul i128 %a, %b whose product is stored to
; %out. Both operands are kernel arguments; nothing is loaded from memory
; by the IR itself.
;
; @v_mul_i128 signature: takes three addrspace(1) pointers (%out, %aptr,
; %bptr) and uses attribute group #0.
;
; The assertion text is autogenerated; regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
ADD_INT T1.W, T1.X, PV.Z, BS:VEC_120/SCL_212 2354; EG-NEXT: MULLO_INT * T1.X, KC0[8].Z, KC0[4].W, 2355; EG-NEXT: ADD_INT T4.X, PV.W, PV.Z, 2356; EG-NEXT: ADDC_UINT T1.Y, PV.W, PV.Z, 2357; EG-NEXT: ADD_INT T1.Z, PV.Y, PS, 2358; EG-NEXT: ADD_INT T0.W, PV.X, T0.W, 2359; EG-NEXT: MULLO_INT * T1.X, KC0[7].W, KC0[5].Y, 2360; EG-NEXT: ADD_INT T2.Y, PV.Z, PV.W, 2361; EG-NEXT: ADDC_UINT T1.Z, T0.Z, PS, 2362; EG-NEXT: ADD_INT T0.W, T0.Y, PV.Y, 2363; EG-NEXT: ADDC_UINT * T1.W, T0.X, PV.X, 2364; EG-NEXT: ADD_INT T0.Y, T0.X, T4.X, 2365; EG-NEXT: ADD_INT T0.Z, T0.Z, T1.X, BS:VEC_021/SCL_122 2366; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2367; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z, 2368; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2369; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z, 2370; EG-NEXT: ADD_INT * T0.W, PV.W, PS, 2371; EG-NEXT: ADD_INT * T0.Z, T0.Y, T0.Z, 2372; EG-NEXT: ADD_INT * T0.Y, T3.X, T2.W, 2373; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2374; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2375; EG-NEXT: MULLO_INT * T0.X, KC0[4].W, KC0[7].W, 2376entry: 2377 %mul = mul i128 %a, %b 2378 store i128 %mul, ptr addrspace(1) %out 2379 ret void 2380} 2381 2382define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 { 2383; SI-LABEL: v_mul_i128: 2384; SI: ; %bb.0: ; %entry 2385; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2386; SI-NEXT: s_mov_b32 s7, 0xf000 2387; SI-NEXT: s_mov_b32 s6, 0 2388; SI-NEXT: v_lshlrev_b32_e32 v8, 4, v0 2389; SI-NEXT: v_mov_b32_e32 v9, 0 2390; SI-NEXT: s_waitcnt lgkmcnt(0) 2391; SI-NEXT: s_mov_b64 s[4:5], s[0:1] 2392; SI-NEXT: s_mov_b64 s[0:1], s[2:3] 2393; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 2394; SI-NEXT: buffer_load_dwordx4 v[0:3], v[8:9], s[4:7], 0 addr64 2395; SI-NEXT: buffer_load_dwordx4 v[4:7], v[8:9], s[0:3], 0 addr64 2396; SI-NEXT: s_waitcnt vmcnt(0) 2397; SI-NEXT: v_mul_lo_u32 v3, v4, v3 2398; SI-NEXT: v_mul_hi_u32 v10, v4, v2 2399; SI-NEXT: v_mul_lo_u32 v12, v6, v1 2400; SI-NEXT: v_mul_hi_u32 v13, v6, v0 2401; 
SI-NEXT: v_mul_lo_u32 v17, v1, v4 2402; SI-NEXT: v_mul_hi_u32 v18, v0, v4 2403; SI-NEXT: v_mul_lo_u32 v11, v5, v2 2404; SI-NEXT: v_mul_lo_u32 v7, v7, v0 2405; SI-NEXT: v_mul_hi_u32 v16, v1, v4 2406; SI-NEXT: v_mul_lo_u32 v15, v0, v5 2407; SI-NEXT: v_mul_hi_u32 v14, v0, v5 2408; SI-NEXT: v_mul_hi_u32 v19, v1, v5 2409; SI-NEXT: v_mul_lo_u32 v5, v1, v5 2410; SI-NEXT: v_add_i32_e32 v1, vcc, v10, v3 2411; SI-NEXT: v_add_i32_e32 v3, vcc, v13, v12 2412; SI-NEXT: v_mul_lo_u32 v2, v4, v2 2413; SI-NEXT: v_mul_lo_u32 v6, v6, v0 2414; SI-NEXT: v_mul_lo_u32 v0, v0, v4 2415; SI-NEXT: v_add_i32_e32 v4, vcc, v17, v18 2416; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v16, vcc 2417; SI-NEXT: v_add_i32_e32 v11, vcc, v1, v11 2418; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7 2419; SI-NEXT: v_add_i32_e32 v1, vcc, v15, v4 2420; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc 2421; SI-NEXT: v_add_i32_e32 v2, vcc, v6, v2 2422; SI-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc 2423; SI-NEXT: v_add_i32_e32 v4, vcc, v10, v4 2424; SI-NEXT: v_addc_u32_e64 v6, s[4:5], 0, 0, vcc 2425; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2426; SI-NEXT: v_addc_u32_e32 v5, vcc, v19, v6, vcc 2427; SI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 2428; SI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 2429; SI-NEXT: buffer_store_dwordx4 v[0:3], v[8:9], s[0:3], 0 addr64 2430; SI-NEXT: s_endpgm 2431; 2432; VI-LABEL: v_mul_i128: 2433; VI: ; %bb.0: ; %entry 2434; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 2435; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0 2436; VI-NEXT: v_mov_b32_e32 v11, 0 2437; VI-NEXT: s_waitcnt lgkmcnt(0) 2438; VI-NEXT: v_mov_b32_e32 v1, s1 2439; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2440; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2441; VI-NEXT: v_mov_b32_e32 v3, s3 2442; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2 2443; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc 2444; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2445; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9] 2446; VI-NEXT: s_waitcnt vmcnt(0) 2447; VI-NEXT: v_mul_lo_u32 v10, v4, v3 2448; 
VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v4, v2, 0 2449; VI-NEXT: v_mul_lo_u32 v14, v5, v2 2450; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 2451; VI-NEXT: v_mul_lo_u32 v15, v7, v0 2452; VI-NEXT: v_add_u32_e32 v7, vcc, v13, v10 2453; VI-NEXT: v_mov_b32_e32 v10, v3 2454; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v4, v[10:11] 2455; VI-NEXT: v_add_u32_e32 v13, vcc, v7, v14 2456; VI-NEXT: v_mov_b32_e32 v7, v4 2457; VI-NEXT: v_mov_b32_e32 v4, v11 2458; VI-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v6, v0, v[12:13] 2459; VI-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v0, v5, v[3:4] 2460; VI-NEXT: v_add_u32_e32 v11, vcc, v15, v13 2461; VI-NEXT: v_mov_b32_e32 v0, v4 2462; VI-NEXT: v_mul_lo_u32 v10, v6, v1 2463; VI-NEXT: v_add_u32_e32 v6, vcc, v7, v0 2464; VI-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, vcc 2465; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] 2466; VI-NEXT: v_add_u32_e32 v5, vcc, v10, v11 2467; VI-NEXT: v_add_u32_e32 v4, vcc, v0, v12 2468; VI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc 2469; VI-NEXT: flat_store_dwordx4 v[8:9], v[2:5] 2470; VI-NEXT: s_endpgm 2471; 2472; GFX9-LABEL: v_mul_i128: 2473; GFX9: ; %bb.0: ; %entry 2474; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 2475; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0 2476; GFX9-NEXT: v_mov_b32_e32 v10, 0 2477; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2478; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1] 2479; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3] 2480; GFX9-NEXT: s_waitcnt vmcnt(0) 2481; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 2482; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2 2483; GFX9-NEXT: v_mul_lo_u32 v15, v4, v3 2484; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[0:1], v1, v4, v[9:10] 2485; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 2486; GFX9-NEXT: v_mov_b32_e32 v4, v12 2487; GFX9-NEXT: v_mov_b32_e32 v12, v10 2488; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v0, v5, v[11:12] 2489; GFX9-NEXT: v_add3_u32 v3, v3, v15, v14 2490; GFX9-NEXT: v_mul_lo_u32 v17, v7, v0 2491; GFX9-NEXT: v_mad_u64_u32 v[2:3], 
s[0:1], v6, v0, v[2:3] 2492; GFX9-NEXT: v_mov_b32_e32 v0, v10 2493; GFX9-NEXT: v_mul_lo_u32 v16, v6, v1 2494; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v0 2495; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, 0, vcc 2496; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v5, v[6:7] 2497; GFX9-NEXT: v_add3_u32 v3, v17, v3, v16 2498; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2 2499; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc 2500; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3] 2501; GFX9-NEXT: s_endpgm 2502; 2503; GFX10-LABEL: v_mul_i128: 2504; GFX10: ; %bb.0: ; %entry 2505; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c 2506; GFX10-NEXT: v_lshlrev_b32_e32 v14, 4, v0 2507; GFX10-NEXT: v_mov_b32_e32 v10, 0 2508; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2509; GFX10-NEXT: s_clause 0x1 2510; GFX10-NEXT: global_load_dwordx4 v[0:3], v14, s[0:1] 2511; GFX10-NEXT: global_load_dwordx4 v[4:7], v14, s[2:3] 2512; GFX10-NEXT: s_waitcnt vmcnt(0) 2513; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0 2514; GFX10-NEXT: v_mul_lo_u32 v7, v7, v0 2515; GFX10-NEXT: v_mad_u64_u32 v[11:12], s0, v1, v4, v[9:10] 2516; GFX10-NEXT: v_mov_b32_e32 v9, v12 2517; GFX10-NEXT: v_mov_b32_e32 v12, v10 2518; GFX10-NEXT: v_mul_lo_u32 v10, v5, v2 2519; GFX10-NEXT: v_mad_u64_u32 v[12:13], s0, v0, v5, v[11:12] 2520; GFX10-NEXT: v_mul_lo_u32 v11, v4, v3 2521; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v2, 0 2522; GFX10-NEXT: v_mov_b32_e32 v4, v13 2523; GFX10-NEXT: v_mul_lo_u32 v13, v6, v1 2524; GFX10-NEXT: v_add3_u32 v3, v3, v11, v10 2525; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v4 2526; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s0, 0, 0, s0 2527; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v6, v0, v[2:3] 2528; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v5, v[9:10] 2529; GFX10-NEXT: v_mov_b32_e32 v9, v12 2530; GFX10-NEXT: v_add3_u32 v3, v7, v3, v13 2531; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 2532; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo 2533; GFX10-NEXT: global_store_dwordx4 v14, v[8:11], s[2:3] 
2534; GFX10-NEXT: s_endpgm 2535; 2536; GFX11-LABEL: v_mul_i128: 2537; GFX11: ; %bb.0: ; %entry 2538; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c 2539; GFX11-NEXT: v_lshlrev_b32_e32 v16, 4, v0 2540; GFX11-NEXT: v_mov_b32_e32 v10, 0 2541; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2542; GFX11-NEXT: s_clause 0x1 2543; GFX11-NEXT: global_load_b128 v[0:3], v16, s[0:1] 2544; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] 2545; GFX11-NEXT: s_waitcnt vmcnt(0) 2546; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0 2547; GFX11-NEXT: v_mul_lo_u32 v15, v5, v2 2548; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3 2549; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2550; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v1, v4, v[9:10] 2551; GFX11-NEXT: v_dual_mov_b32 v9, v12 :: v_dual_mov_b32 v12, v10 2552; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) 2553; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v0, v5, v[11:12] 2554; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v4, v2, 0 2555; GFX11-NEXT: v_mul_lo_u32 v4, v6, v1 2556; GFX11-NEXT: v_mul_lo_u32 v12, v7, v0 2557; GFX11-NEXT: v_mov_b32_e32 v2, v14 2558; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 2559; GFX11-NEXT: v_add3_u32 v11, v11, v3, v15 2560; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 2561; GFX11-NEXT: v_mov_b32_e32 v9, v13 2562; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 2563; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 2564; GFX11-NEXT: v_mad_u64_u32 v[14:15], null, v6, v0, v[10:11] 2565; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v1, v5, v[2:3] 2566; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2567; GFX11-NEXT: v_add3_u32 v0, v12, v15, v4 2568; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v14 2569; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2570; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo 2571; GFX11-NEXT: global_store_b128 v16, v[8:11], 
s[2:3] 2572; GFX11-NEXT: s_nop 0 2573; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 2574; GFX11-NEXT: s_endpgm 2575; 2576; EG-LABEL: v_mul_i128: 2577; EG: ; %bb.0: ; %entry 2578; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] 2579; EG-NEXT: TEX 1 @6 2580; EG-NEXT: ALU 41, @14, KC0[], KC1[] 2581; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2582; EG-NEXT: CF_END 2583; EG-NEXT: PAD 2584; EG-NEXT: Fetch clause starting at 6: 2585; EG-NEXT: VTX_READ_128 T2.XYZW, T1.X, 0, #1 2586; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2587; EG-NEXT: ALU clause starting at 10: 2588; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 2589; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 2590; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, 2591; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, 2592; EG-NEXT: ALU clause starting at 14: 2593; EG-NEXT: MULLO_INT * T1.Y, T0.Y, T2.Y, 2594; EG-NEXT: MULHI * T1.Z, T0.Y, T2.Y, 2595; EG-NEXT: MULLO_INT * T1.W, T2.Z, T0.X, 2596; EG-NEXT: MULLO_INT * T3.X, T2.Y, T0.Z, 2597; EG-NEXT: MULHI * T3.Y, T0.Y, T2.X, 2598; EG-NEXT: MULHI * T3.Z, T0.X, T2.Y, 2599; EG-NEXT: MULHI * T3.W, T2.Z, T0.X, 2600; EG-NEXT: MULLO_INT * T2.Z, T2.Z, T0.Y, 2601; EG-NEXT: MULHI * T4.X, T2.X, T0.Z, 2602; EG-NEXT: MULLO_INT * T0.Y, T0.Y, T2.X, 2603; EG-NEXT: MULHI * T4.Y, T0.X, T2.X, 2604; EG-NEXT: ADD_INT T4.W, T0.Y, PS, 2605; EG-NEXT: MULLO_INT * T2.Y, T0.X, T2.Y, 2606; EG-NEXT: ADDC_UINT T4.Z, T0.Y, T4.Y, 2607; EG-NEXT: ADDC_UINT T5.W, PS, PV.W, 2608; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.W, 2609; EG-NEXT: ADD_INT T4.X, T4.X, PS, 2610; EG-NEXT: ADD_INT T0.Y, T3.W, T2.Z, 2611; EG-NEXT: ADD_INT T2.Z, T3.Z, PV.W, 2612; EG-NEXT: ADD_INT T0.W, T3.Y, PV.Z, 2613; EG-NEXT: MULLO_INT * T2.W, T2.W, T0.X, 2614; EG-NEXT: ADD_INT T5.X, PV.W, PV.Z, 2615; EG-NEXT: ADDC_UINT T3.Y, PV.W, PV.Z, 2616; EG-NEXT: ADD_INT T2.Z, PV.Y, PS, 2617; EG-NEXT: ADD_INT T0.W, PV.X, T3.X, 2618; EG-NEXT: MULLO_INT * T0.Y, T2.X, T0.Z, 2619; EG-NEXT: ADD_INT T4.Y, PV.Z, PV.W, 2620; EG-NEXT: ADDC_UINT T0.Z, T1.W, PS, 2621; EG-NEXT: 
; Doc note for the physical line below. In order, it carries the last EG
; assertions for @v_mul_i128, that function's IR body, the workitem-id
; intrinsic declaration and the two attribute groups.
;
; @v_mul_i128 IR body: indexes %aptr and %bptr by the value returned from
; @llvm.amdgcn.workitem.id.x, loads one i128 from each, multiplies them and
; stores the product through %gep.out.
; NOTE(review): %gep.out is computed from %bptr, not from %out, so the product
; is written back over the %bptr element and the %out parameter is unused.
; This looks unintentional, but the autogenerated assertions encode the
; current behaviour. If the IR is changed, regenerate the assertions with
; utils/update_llc_test_checks.py instead of editing them by hand.
;
; Attribute group #0 is { nounwind }. Group #1, attached to the workitem-id
; declaration, is { nounwind readnone}.
ADD_INT T0.W, T1.Z, PV.Y, 2622; EG-NEXT: ADDC_UINT * T2.W, T1.Y, PV.X, 2623; EG-NEXT: ADD_INT T1.Y, T1.Y, T5.X, 2624; EG-NEXT: ADD_INT T1.Z, T1.W, T0.Y, 2625; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2626; EG-NEXT: ADD_INT * T1.W, PV.Y, PV.Z, 2627; EG-NEXT: ADD_INT T0.W, PV.W, PS, 2628; EG-NEXT: ADDC_UINT * T1.W, PV.Y, PV.Z, 2629; EG-NEXT: ADD_INT * T0.W, PV.W, PS, 2630; EG-NEXT: ADD_INT * T0.Z, T1.Y, T1.Z, 2631; EG-NEXT: ADD_INT * T0.Y, T2.Y, T4.W, 2632; EG-NEXT: LSHR T1.X, T1.X, literal.x, 2633; EG-NEXT: MULLO_INT * T0.X, T0.X, T2.X, 2634; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2635entry: 2636 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2637 %gep.a = getelementptr inbounds i128, ptr addrspace(1) %aptr, i32 %tid 2638 %gep.b = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid 2639 %gep.out = getelementptr inbounds i128, ptr addrspace(1) %bptr, i32 %tid 2640 %a = load i128, ptr addrspace(1) %gep.a 2641 %b = load i128, ptr addrspace(1) %gep.b 2642 %mul = mul i128 %a, %b 2643 store i128 %mul, ptr addrspace(1) %gep.out 2644 ret void 2645} 2646 2647declare i32 @llvm.amdgcn.workitem.id.x() #1 2648 2649attributes #0 = { nounwind } 2650attributes #1 = { nounwind readnone} 2651