1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s 3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s 5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s 6; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s 7; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s 8; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s 9 10define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { 11; GFX7-LABEL: s_mul_i16: 12; GFX7: ; %bb.0: 13; GFX7-NEXT: s_mul_i32 s0, s0, s1 14; GFX7-NEXT: ; return to shader part epilog 15; 16; GFX8-LABEL: s_mul_i16: 17; GFX8: ; %bb.0: 18; GFX8-NEXT: s_and_b32 s0, s0, 0xffff 19; GFX8-NEXT: s_and_b32 s1, s1, 0xffff 20; GFX8-NEXT: s_mul_i32 s0, s0, s1 21; GFX8-NEXT: ; return to shader part epilog 22; 23; GFX9-LABEL: s_mul_i16: 24; GFX9: ; %bb.0: 25; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 26; GFX9-NEXT: s_and_b32 s1, s1, 0xffff 27; GFX9-NEXT: s_mul_i32 s0, s0, s1 28; GFX9-NEXT: ; return to shader part epilog 29; 30; GFX10PLUS-LABEL: s_mul_i16: 31; GFX10PLUS: ; %bb.0: 32; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff 33; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff 34; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 35; GFX10PLUS-NEXT: ; return to shader part epilog 36; 37; GFX12-LABEL: s_mul_i16: 38; GFX12: ; %bb.0: 39; 
GFX12-NEXT: s_and_b32 s0, s0, 0xffff 40; GFX12-NEXT: s_and_b32 s1, s1, 0xffff 41; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 42; GFX12-NEXT: s_mul_i32 s0, s0, s1 43; GFX12-NEXT: ; return to shader part epilog 44 %result = mul i16 %num, %den 45 ret i16 %result 46} 47 48define i16 @v_mul_i16(i16 %num, i16 %den) { 49; GFX7-LABEL: v_mul_i16: 50; GFX7: ; %bb.0: 51; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 52; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 53; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 54; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 55; GFX7-NEXT: s_setpc_b64 s[30:31] 56; 57; GFX8-LABEL: v_mul_i16: 58; GFX8: ; %bb.0: 59; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 60; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 61; GFX8-NEXT: s_setpc_b64 s[30:31] 62; 63; GFX9-LABEL: v_mul_i16: 64; GFX9: ; %bb.0: 65; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 66; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 67; GFX9-NEXT: s_setpc_b64 s[30:31] 68; 69; GFX10-LABEL: v_mul_i16: 70; GFX10: ; %bb.0: 71; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 72; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 73; GFX10-NEXT: s_setpc_b64 s[30:31] 74; 75; GFX11-TRUE16-LABEL: v_mul_i16: 76; GFX11-TRUE16: ; %bb.0: 77; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 78; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l 79; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 80; 81; GFX11-FAKE16-LABEL: v_mul_i16: 82; GFX11-FAKE16: ; %bb.0: 83; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 84; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 85; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 86; 87; GFX12-LABEL: v_mul_i16: 88; GFX12: ; %bb.0: 89; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 90; GFX12-NEXT: s_wait_expcnt 0x0 91; GFX12-NEXT: s_wait_samplecnt 0x0 92; GFX12-NEXT: s_wait_bvhcnt 0x0 93; GFX12-NEXT: s_wait_kmcnt 0x0 94; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 95; GFX12-NEXT: s_setpc_b64 s[30:31] 96 %result = mul i16 %num, %den 97 ret i16 %result 98} 99 100define amdgpu_ps zeroext i16 
@s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) { 101; GFX7-LABEL: s_mul_i16_zeroext: 102; GFX7: ; %bb.0: 103; GFX7-NEXT: s_mul_i32 s0, s0, s1 104; GFX7-NEXT: s_and_b32 s0, s0, 0xffff 105; GFX7-NEXT: ; return to shader part epilog 106; 107; GFX8-LABEL: s_mul_i16_zeroext: 108; GFX8: ; %bb.0: 109; GFX8-NEXT: s_and_b32 s0, s0, 0xffff 110; GFX8-NEXT: s_and_b32 s1, s1, 0xffff 111; GFX8-NEXT: s_mul_i32 s0, s0, s1 112; GFX8-NEXT: s_and_b32 s0, s0, 0xffff 113; GFX8-NEXT: ; return to shader part epilog 114; 115; GFX9-LABEL: s_mul_i16_zeroext: 116; GFX9: ; %bb.0: 117; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 118; GFX9-NEXT: s_and_b32 s1, s1, 0xffff 119; GFX9-NEXT: s_mul_i32 s0, s0, s1 120; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 121; GFX9-NEXT: ; return to shader part epilog 122; 123; GFX10PLUS-LABEL: s_mul_i16_zeroext: 124; GFX10PLUS: ; %bb.0: 125; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff 126; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff 127; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 128; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff 129; GFX10PLUS-NEXT: ; return to shader part epilog 130; 131; GFX12-LABEL: s_mul_i16_zeroext: 132; GFX12: ; %bb.0: 133; GFX12-NEXT: s_and_b32 s0, s0, 0xffff 134; GFX12-NEXT: s_and_b32 s1, s1, 0xffff 135; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 136; GFX12-NEXT: s_mul_i32 s0, s0, s1 137; GFX12-NEXT: s_and_b32 s0, s0, 0xffff 138; GFX12-NEXT: ; return to shader part epilog 139 %result = mul i16 %num, %den 140 ret i16 %result 141} 142 143define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { 144; GFX7-LABEL: v_mul_i16_zeroext: 145; GFX7: ; %bb.0: 146; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 147; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 148; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 149; GFX7-NEXT: s_setpc_b64 s[30:31] 150; 151; GFX8-LABEL: v_mul_i16_zeroext: 152; GFX8: ; %bb.0: 153; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 154; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 
155; GFX8-NEXT: s_setpc_b64 s[30:31] 156; 157; GFX9-LABEL: v_mul_i16_zeroext: 158; GFX9: ; %bb.0: 159; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 160; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 161; GFX9-NEXT: s_setpc_b64 s[30:31] 162; 163; GFX10-LABEL: v_mul_i16_zeroext: 164; GFX10: ; %bb.0: 165; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 166; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 167; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 168; GFX10-NEXT: s_setpc_b64 s[30:31] 169; 170; GFX11-TRUE16-LABEL: v_mul_i16_zeroext: 171; GFX11-TRUE16: ; %bb.0: 172; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 173; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l 174; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 175; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 176; 177; GFX11-FAKE16-LABEL: v_mul_i16_zeroext: 178; GFX11-FAKE16: ; %bb.0: 179; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 180; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 181; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 182; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 183; 184; GFX12-LABEL: v_mul_i16_zeroext: 185; GFX12: ; %bb.0: 186; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 187; GFX12-NEXT: s_wait_expcnt 0x0 188; GFX12-NEXT: s_wait_samplecnt 0x0 189; GFX12-NEXT: s_wait_bvhcnt 0x0 190; GFX12-NEXT: s_wait_kmcnt 0x0 191; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 192; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 193; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 194; GFX12-NEXT: s_setpc_b64 s[30:31] 195 %result = mul i16 %num, %den 196 ret i16 %result 197} 198 199define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) { 200; GFX7-LABEL: s_mul_i16_signext: 201; GFX7: ; %bb.0: 202; GFX7-NEXT: s_mul_i32 s0, s0, s1 203; GFX7-NEXT: s_sext_i32_i16 s0, s0 204; GFX7-NEXT: ; return to shader part epilog 205; 206; GFX8-LABEL: s_mul_i16_signext: 207; GFX8: ; %bb.0: 208; GFX8-NEXT: s_and_b32 s0, s0, 0xffff 209; GFX8-NEXT: s_and_b32 s1, s1, 0xffff 210; GFX8-NEXT: s_mul_i32 s0, s0, 
s1 211; GFX8-NEXT: s_sext_i32_i16 s0, s0 212; GFX8-NEXT: ; return to shader part epilog 213; 214; GFX9-LABEL: s_mul_i16_signext: 215; GFX9: ; %bb.0: 216; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 217; GFX9-NEXT: s_and_b32 s1, s1, 0xffff 218; GFX9-NEXT: s_mul_i32 s0, s0, s1 219; GFX9-NEXT: s_sext_i32_i16 s0, s0 220; GFX9-NEXT: ; return to shader part epilog 221; 222; GFX10PLUS-LABEL: s_mul_i16_signext: 223; GFX10PLUS: ; %bb.0: 224; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff 225; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff 226; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 227; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0 228; GFX10PLUS-NEXT: ; return to shader part epilog 229; 230; GFX12-LABEL: s_mul_i16_signext: 231; GFX12: ; %bb.0: 232; GFX12-NEXT: s_and_b32 s0, s0, 0xffff 233; GFX12-NEXT: s_and_b32 s1, s1, 0xffff 234; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 235; GFX12-NEXT: s_mul_i32 s0, s0, s1 236; GFX12-NEXT: s_sext_i32_i16 s0, s0 237; GFX12-NEXT: ; return to shader part epilog 238 %result = mul i16 %num, %den 239 ret i16 %result 240} 241 242define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { 243; GFX7-LABEL: v_mul_i16_signext: 244; GFX7: ; %bb.0: 245; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 246; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 247; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 248; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1 249; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 250; GFX7-NEXT: s_setpc_b64 s[30:31] 251; 252; GFX8-LABEL: v_mul_i16_signext: 253; GFX8: ; %bb.0: 254; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 255; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1 256; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 257; GFX8-NEXT: s_setpc_b64 s[30:31] 258; 259; GFX9-LABEL: v_mul_i16_signext: 260; GFX9: ; %bb.0: 261; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 262; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1 263; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 264; GFX9-NEXT: s_setpc_b64 s[30:31] 265; 266; GFX10-LABEL: v_mul_i16_signext: 
267; GFX10: ; %bb.0: 268; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 269; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1 270; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 271; GFX10-NEXT: s_setpc_b64 s[30:31] 272; 273; GFX11-TRUE16-LABEL: v_mul_i16_signext: 274; GFX11-TRUE16: ; %bb.0: 275; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 276; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l 277; GFX11-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 278; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] 279; 280; GFX11-FAKE16-LABEL: v_mul_i16_signext: 281; GFX11-FAKE16: ; %bb.0: 282; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 283; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 284; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 285; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] 286; 287; GFX12-LABEL: v_mul_i16_signext: 288; GFX12: ; %bb.0: 289; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 290; GFX12-NEXT: s_wait_expcnt 0x0 291; GFX12-NEXT: s_wait_samplecnt 0x0 292; GFX12-NEXT: s_wait_bvhcnt 0x0 293; GFX12-NEXT: s_wait_kmcnt 0x0 294; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1 295; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 296; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 297; GFX12-NEXT: s_setpc_b64 s[30:31] 298 %result = mul i16 %num, %den 299 ret i16 %result 300} 301 302define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { 303; GCN-LABEL: s_mul_i32: 304; GCN: ; %bb.0: 305; GCN-NEXT: s_mul_i32 s0, s0, s1 306; GCN-NEXT: ; return to shader part epilog 307; 308; GFX10PLUS-LABEL: s_mul_i32: 309; GFX10PLUS: ; %bb.0: 310; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1 311; GFX10PLUS-NEXT: ; return to shader part epilog 312; 313; GFX12-LABEL: s_mul_i32: 314; GFX12: ; %bb.0: 315; GFX12-NEXT: s_mul_i32 s0, s0, s1 316; GFX12-NEXT: ; return to shader part epilog 317 %result = mul i32 %num, %den 318 ret i32 %result 319} 320 321define i32 @v_mul_i32(i32 %num, i32 %den) { 322; GCN-LABEL: v_mul_i32: 323; GCN: ; %bb.0: 324; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 325; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 326; 
GCN-NEXT: s_setpc_b64 s[30:31] 327; 328; GFX10PLUS-LABEL: v_mul_i32: 329; GFX10PLUS: ; %bb.0: 330; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 331; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1 332; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 333; 334; GFX12-LABEL: v_mul_i32: 335; GFX12: ; %bb.0: 336; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 337; GFX12-NEXT: s_wait_expcnt 0x0 338; GFX12-NEXT: s_wait_samplecnt 0x0 339; GFX12-NEXT: s_wait_bvhcnt 0x0 340; GFX12-NEXT: s_wait_kmcnt 0x0 341; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1 342; GFX12-NEXT: s_setpc_b64 s[30:31] 343 %result = mul i32 %num, %den 344 ret i32 %result 345} 346 347define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { 348; GCN-LABEL: s_mul_v2i32: 349; GCN: ; %bb.0: 350; GCN-NEXT: s_mul_i32 s0, s0, s2 351; GCN-NEXT: s_mul_i32 s1, s1, s3 352; GCN-NEXT: ; return to shader part epilog 353; 354; GFX10PLUS-LABEL: s_mul_v2i32: 355; GFX10PLUS: ; %bb.0: 356; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 357; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3 358; GFX10PLUS-NEXT: ; return to shader part epilog 359; 360; GFX12-LABEL: s_mul_v2i32: 361; GFX12: ; %bb.0: 362; GFX12-NEXT: s_mul_i32 s0, s0, s2 363; GFX12-NEXT: s_mul_i32 s1, s1, s3 364; GFX12-NEXT: ; return to shader part epilog 365 %result = mul <2 x i32> %num, %den 366 ret <2 x i32> %result 367} 368 369define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { 370; GCN-LABEL: v_mul_v2i32: 371; GCN: ; %bb.0: 372; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 373; GCN-NEXT: v_mul_lo_u32 v0, v0, v2 374; GCN-NEXT: v_mul_lo_u32 v1, v1, v3 375; GCN-NEXT: s_setpc_b64 s[30:31] 376; 377; GFX10PLUS-LABEL: v_mul_v2i32: 378; GFX10PLUS: ; %bb.0: 379; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 380; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2 381; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3 382; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 383; 384; GFX12-LABEL: v_mul_v2i32: 385; GFX12: ; %bb.0: 386; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 387; GFX12-NEXT: 
s_wait_expcnt 0x0 388; GFX12-NEXT: s_wait_samplecnt 0x0 389; GFX12-NEXT: s_wait_bvhcnt 0x0 390; GFX12-NEXT: s_wait_kmcnt 0x0 391; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 392; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3 393; GFX12-NEXT: s_setpc_b64 s[30:31] 394 %result = mul <2 x i32> %num, %den 395 ret <2 x i32> %result 396} 397 398define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { 399; GFX7-LABEL: s_mul_i33: 400; GFX7: ; %bb.0: 401; GFX7-NEXT: v_mov_b32_e32 v0, s2 402; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 403; GFX7-NEXT: s_mul_i32 s4, s0, s2 404; GFX7-NEXT: s_mul_i32 s0, s0, s3 405; GFX7-NEXT: s_mul_i32 s1, s1, s2 406; GFX7-NEXT: v_readfirstlane_b32 s5, v0 407; GFX7-NEXT: s_add_u32 s0, s0, s5 408; GFX7-NEXT: s_add_u32 s1, s1, s0 409; GFX7-NEXT: s_mov_b32 s0, s4 410; GFX7-NEXT: ; return to shader part epilog 411; 412; GFX8-LABEL: s_mul_i33: 413; GFX8: ; %bb.0: 414; GFX8-NEXT: v_mov_b32_e32 v0, s2 415; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 416; GFX8-NEXT: s_mul_i32 s4, s0, s2 417; GFX8-NEXT: s_mul_i32 s0, s0, s3 418; GFX8-NEXT: s_mul_i32 s1, s1, s2 419; GFX8-NEXT: v_readfirstlane_b32 s5, v0 420; GFX8-NEXT: s_add_u32 s0, s0, s5 421; GFX8-NEXT: s_add_u32 s1, s1, s0 422; GFX8-NEXT: s_mov_b32 s0, s4 423; GFX8-NEXT: ; return to shader part epilog 424; 425; GFX9-LABEL: s_mul_i33: 426; GFX9: ; %bb.0: 427; GFX9-NEXT: s_mul_i32 s4, s0, s2 428; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2 429; GFX9-NEXT: s_mul_i32 s0, s0, s3 430; GFX9-NEXT: s_add_u32 s0, s0, s5 431; GFX9-NEXT: s_mul_i32 s1, s1, s2 432; GFX9-NEXT: s_add_u32 s1, s1, s0 433; GFX9-NEXT: s_mov_b32 s0, s4 434; GFX9-NEXT: ; return to shader part epilog 435; 436; GFX10PLUS-LABEL: s_mul_i33: 437; GFX10PLUS: ; %bb.0: 438; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2 439; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3 440; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2 441; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3 442; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 443; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 444; GFX10PLUS-NEXT: ; return to shader part epilog 445; 446; 
GFX12-LABEL: s_mul_i33: 447; GFX12: ; %bb.0: 448; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] 449; GFX12-NEXT: ; return to shader part epilog 450 %result = mul i33 %num, %den 451 ret i33 %result 452} 453 454define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { 455; GFX7-LABEL: s_mul_i64: 456; GFX7: ; %bb.0: 457; GFX7-NEXT: v_mov_b32_e32 v0, s2 458; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 459; GFX7-NEXT: s_mul_i32 s4, s0, s2 460; GFX7-NEXT: s_mul_i32 s0, s0, s3 461; GFX7-NEXT: s_mul_i32 s1, s1, s2 462; GFX7-NEXT: v_readfirstlane_b32 s5, v0 463; GFX7-NEXT: s_add_u32 s0, s0, s5 464; GFX7-NEXT: s_add_u32 s1, s1, s0 465; GFX7-NEXT: s_mov_b32 s0, s4 466; GFX7-NEXT: ; return to shader part epilog 467; 468; GFX8-LABEL: s_mul_i64: 469; GFX8: ; %bb.0: 470; GFX8-NEXT: v_mov_b32_e32 v0, s2 471; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 472; GFX8-NEXT: s_mul_i32 s4, s0, s2 473; GFX8-NEXT: s_mul_i32 s0, s0, s3 474; GFX8-NEXT: s_mul_i32 s1, s1, s2 475; GFX8-NEXT: v_readfirstlane_b32 s5, v0 476; GFX8-NEXT: s_add_u32 s0, s0, s5 477; GFX8-NEXT: s_add_u32 s1, s1, s0 478; GFX8-NEXT: s_mov_b32 s0, s4 479; GFX8-NEXT: ; return to shader part epilog 480; 481; GFX9-LABEL: s_mul_i64: 482; GFX9: ; %bb.0: 483; GFX9-NEXT: s_mul_i32 s4, s0, s2 484; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2 485; GFX9-NEXT: s_mul_i32 s0, s0, s3 486; GFX9-NEXT: s_add_u32 s0, s0, s5 487; GFX9-NEXT: s_mul_i32 s1, s1, s2 488; GFX9-NEXT: s_add_u32 s1, s1, s0 489; GFX9-NEXT: s_mov_b32 s0, s4 490; GFX9-NEXT: ; return to shader part epilog 491; 492; GFX10PLUS-LABEL: s_mul_i64: 493; GFX10PLUS: ; %bb.0: 494; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2 495; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3 496; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2 497; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3 498; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2 499; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1 500; GFX10PLUS-NEXT: ; return to shader part epilog 501; 502; GFX12-LABEL: s_mul_i64: 503; GFX12: ; %bb.0: 504; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] 505; GFX12-NEXT: ; 
return to shader part epilog 506 %result = mul i64 %num, %den 507 ret i64 %result 508} 509 510define i64 @v_mul_i64(i64 %num, i64 %den) { 511; GCN-LABEL: v_mul_i64: 512; GCN: ; %bb.0: 513; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 514; GCN-NEXT: v_mov_b32_e32 v4, v0 515; GCN-NEXT: v_mov_b32_e32 v5, v1 516; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 517; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] 518; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] 519; GCN-NEXT: s_setpc_b64 s[30:31] 520; 521; GFX10-LABEL: v_mul_i64: 522; GFX10: ; %bb.0: 523; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 524; GFX10-NEXT: v_mov_b32_e32 v4, v0 525; GFX10-NEXT: v_mov_b32_e32 v5, v1 526; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0 527; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2] 528; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4] 529; GFX10-NEXT: s_setpc_b64 s[30:31] 530; 531; GFX11-LABEL: v_mul_i64: 532; GFX11: ; %bb.0: 533; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 534; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2 535; GFX11-NEXT: v_mov_b32_e32 v6, v1 536; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0 537; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2] 538; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8] 539; GFX11-NEXT: s_setpc_b64 s[30:31] 540; 541; GFX12-LABEL: v_mul_i64: 542; GFX12: ; %bb.0: 543; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 544; GFX12-NEXT: s_wait_expcnt 0x0 545; GFX12-NEXT: s_wait_samplecnt 0x0 546; GFX12-NEXT: s_wait_bvhcnt 0x0 547; GFX12-NEXT: s_wait_kmcnt 0x0 548; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2 549; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 550; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5] 551; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2 552; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4] 553; GFX12-NEXT: s_setpc_b64 s[30:31] 554 %result = mul i64 %num, %den 555 ret i64 %result 556} 557 558define 
amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { 559; GFX7-LABEL: s_mul_i96: 560; GFX7: ; %bb.0: 561; GFX7-NEXT: v_mov_b32_e32 v0, s3 562; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 563; GFX7-NEXT: v_mov_b32_e32 v1, s4 564; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1 565; GFX7-NEXT: s_mul_i32 s5, s0, s5 566; GFX7-NEXT: v_readfirstlane_b32 s7, v0 567; GFX7-NEXT: s_mul_i32 s8, s1, s4 568; GFX7-NEXT: v_mov_b32_e32 v0, s1 569; GFX7-NEXT: s_add_u32 s5, s8, s5 570; GFX7-NEXT: s_mul_i32 s2, s2, s3 571; GFX7-NEXT: v_mul_hi_u32 v0, v0, s3 572; GFX7-NEXT: s_mul_i32 s6, s0, s3 573; GFX7-NEXT: s_add_u32 s2, s2, s5 574; GFX7-NEXT: s_mul_i32 s0, s0, s4 575; GFX7-NEXT: v_readfirstlane_b32 s4, v1 576; GFX7-NEXT: s_add_u32 s0, s0, s7 577; GFX7-NEXT: s_addc_u32 s2, s4, s2 578; GFX7-NEXT: s_mul_i32 s1, s1, s3 579; GFX7-NEXT: v_readfirstlane_b32 s3, v0 580; GFX7-NEXT: s_add_u32 s1, s1, s0 581; GFX7-NEXT: s_addc_u32 s2, s3, s2 582; GFX7-NEXT: s_mov_b32 s0, s6 583; GFX7-NEXT: ; return to shader part epilog 584; 585; GFX8-LABEL: s_mul_i96: 586; GFX8: ; %bb.0: 587; GFX8-NEXT: v_mov_b32_e32 v0, s3 588; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 589; GFX8-NEXT: v_mov_b32_e32 v1, s4 590; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 591; GFX8-NEXT: s_mul_i32 s5, s0, s5 592; GFX8-NEXT: v_readfirstlane_b32 s7, v0 593; GFX8-NEXT: s_mul_i32 s8, s1, s4 594; GFX8-NEXT: v_mov_b32_e32 v0, s1 595; GFX8-NEXT: s_add_u32 s5, s8, s5 596; GFX8-NEXT: s_mul_i32 s2, s2, s3 597; GFX8-NEXT: v_mul_hi_u32 v0, v0, s3 598; GFX8-NEXT: s_mul_i32 s6, s0, s3 599; GFX8-NEXT: s_add_u32 s2, s2, s5 600; GFX8-NEXT: s_mul_i32 s0, s0, s4 601; GFX8-NEXT: v_readfirstlane_b32 s4, v1 602; GFX8-NEXT: s_add_u32 s0, s0, s7 603; GFX8-NEXT: s_addc_u32 s2, s4, s2 604; GFX8-NEXT: s_mul_i32 s1, s1, s3 605; GFX8-NEXT: v_readfirstlane_b32 s3, v0 606; GFX8-NEXT: s_add_u32 s1, s1, s0 607; GFX8-NEXT: s_addc_u32 s2, s3, s2 608; GFX8-NEXT: s_mov_b32 s0, s6 609; GFX8-NEXT: ; return to shader part epilog 610; 611; GFX9-LABEL: s_mul_i96: 612; GFX9: ; %bb.0: 613; 
GFX9-NEXT: s_mul_i32 s5, s0, s5 614; GFX9-NEXT: s_mul_i32 s8, s1, s4 615; GFX9-NEXT: s_add_u32 s5, s8, s5 616; GFX9-NEXT: s_mul_i32 s2, s2, s3 617; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3 618; GFX9-NEXT: s_add_u32 s2, s2, s5 619; GFX9-NEXT: s_mul_i32 s5, s0, s4 620; GFX9-NEXT: s_mul_i32 s6, s0, s3 621; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4 622; GFX9-NEXT: s_add_u32 s4, s5, s7 623; GFX9-NEXT: s_addc_u32 s0, s0, s2 624; GFX9-NEXT: s_mul_i32 s2, s1, s3 625; GFX9-NEXT: s_mul_hi_u32 s3, s1, s3 626; GFX9-NEXT: s_add_u32 s1, s2, s4 627; GFX9-NEXT: s_addc_u32 s2, s3, s0 628; GFX9-NEXT: s_mov_b32 s0, s6 629; GFX9-NEXT: ; return to shader part epilog 630; 631; GFX10PLUS-LABEL: s_mul_i96: 632; GFX10PLUS: ; %bb.0: 633; GFX10PLUS-NEXT: s_mul_i32 s6, s0, s5 634; GFX10PLUS-NEXT: s_mul_i32 s7, s1, s4 635; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s3 636; GFX10PLUS-NEXT: s_add_i32 s6, s6, s7 637; GFX10PLUS-NEXT: s_mul_hi_u32 s7, s0, s3 638; GFX10PLUS-NEXT: s_add_i32 s6, s6, s2 639; GFX10PLUS-NEXT: s_mul_i32 s2, s0, s4 640; GFX10PLUS-NEXT: s_mul_i32 s5, s0, s3 641; GFX10PLUS-NEXT: s_mul_hi_u32 s0, s0, s4 642; GFX10PLUS-NEXT: s_add_u32 s2, s2, s7 643; GFX10PLUS-NEXT: s_mul_i32 s4, s1, s3 644; GFX10PLUS-NEXT: s_addc_u32 s0, s0, s6 645; GFX10PLUS-NEXT: s_mul_hi_u32 s3, s1, s3 646; GFX10PLUS-NEXT: s_add_u32 s1, s4, s2 647; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0 648; GFX10PLUS-NEXT: s_mov_b32 s0, s5 649; GFX10PLUS-NEXT: ; return to shader part epilog 650; 651; GFX12-LABEL: s_mul_i96: 652; GFX12: ; %bb.0: 653; GFX12-NEXT: s_mul_i32 s6, s0, s5 654; GFX12-NEXT: s_mul_i32 s7, s1, s4 655; GFX12-NEXT: s_mul_i32 s2, s2, s3 656; GFX12-NEXT: s_add_co_i32 s6, s6, s7 657; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 658; GFX12-NEXT: s_add_co_i32 s6, s6, s2 659; GFX12-NEXT: s_mul_i32 s2, s0, s4 660; GFX12-NEXT: s_mul_i32 s5, s0, s3 661; GFX12-NEXT: s_mul_hi_u32 s0, s0, s4 662; GFX12-NEXT: s_add_co_u32 s2, s2, s7 663; GFX12-NEXT: s_mul_i32 s4, s1, s3 664; GFX12-NEXT: s_add_co_ci_u32 s0, s0, s6 665; GFX12-NEXT: s_mul_hi_u32 
s3, s1, s3 666; GFX12-NEXT: s_add_co_u32 s1, s4, s2 667; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0 668; GFX12-NEXT: s_mov_b32 s0, s5 669; GFX12-NEXT: ; return to shader part epilog 670 %result = mul i96 %num, %den 671 %cast = bitcast i96 %result to <3 x i32> 672 ret <3 x i32> %cast 673} 674 675define i96 @v_mul_i96(i96 %num, i96 %den) { 676; GCN-LABEL: v_mul_i96: 677; GCN: ; %bb.0: 678; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 679; GCN-NEXT: v_mov_b32_e32 v6, v0 680; GCN-NEXT: v_mov_b32_e32 v7, v1 681; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 682; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] 683; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 684; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] 685; GCN-NEXT: v_mov_b32_e32 v2, v8 686; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] 687; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] 688; GCN-NEXT: s_setpc_b64 s[30:31] 689; 690; GFX10-LABEL: v_mul_i96: 691; GFX10: ; %bb.0: 692; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 693; GFX10-NEXT: v_mov_b32_e32 v6, v0 694; GFX10-NEXT: v_mov_b32_e32 v7, v1 695; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5 696; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1] 697; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 698; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] 699; GFX10-NEXT: v_mov_b32_e32 v2, v8 700; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] 701; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] 702; GFX10-NEXT: s_setpc_b64 s[30:31] 703; 704; GFX11-LABEL: v_mul_i96: 705; GFX11: ; %bb.0: 706; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 707; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 708; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5 709; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1] 710; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 711; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] 712; GFX11-NEXT: v_mov_b32_e32 v2, v9 713; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, 
v6, v4, v[1:2] 714; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] 715; GFX11-NEXT: s_setpc_b64 s[30:31] 716; 717; GFX12-LABEL: v_mul_i96: 718; GFX12: ; %bb.0: 719; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 720; GFX12-NEXT: s_wait_expcnt 0x0 721; GFX12-NEXT: s_wait_samplecnt 0x0 722; GFX12-NEXT: s_wait_bvhcnt 0x0 723; GFX12-NEXT: s_wait_kmcnt 0x0 724; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 725; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 726; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5 727; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1] 728; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v3, 0 729; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 730; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] 731; GFX12-NEXT: v_mov_b32_e32 v2, v8 732; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 733; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] 734; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] 735; GFX12-NEXT: s_setpc_b64 s[30:31] 736 %result = mul i96 %num, %den 737 ret i96 %result 738} 739 740define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { 741; GFX7-LABEL: s_mul_i128: 742; GFX7: ; %bb.0: 743; GFX7-NEXT: v_mov_b32_e32 v0, s4 744; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 745; GFX7-NEXT: v_mov_b32_e32 v1, s5 746; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1 747; GFX7-NEXT: s_mul_i32 s10, s0, s6 748; GFX7-NEXT: v_readfirstlane_b32 s9, v0 749; GFX7-NEXT: v_mov_b32_e32 v0, s6 750; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0 751; GFX7-NEXT: v_readfirstlane_b32 s13, v2 752; GFX7-NEXT: v_mov_b32_e32 v2, s2 753; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4 754; GFX7-NEXT: s_mul_i32 s12, s1, s5 755; GFX7-NEXT: v_readfirstlane_b32 s11, v0 756; GFX7-NEXT: s_add_u32 s10, s12, s10 757; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1 758; GFX7-NEXT: v_mov_b32_e32 v0, s1 759; GFX7-NEXT: s_addc_u32 s11, s13, s11 760; GFX7-NEXT: s_mul_i32 s12, 
s2, s4 761; GFX7-NEXT: v_readfirstlane_b32 s13, v2 762; GFX7-NEXT: s_add_u32 s10, s12, s10 763; GFX7-NEXT: v_mul_hi_u32 v0, v0, s4 764; GFX7-NEXT: s_addc_u32 s11, s13, s11 765; GFX7-NEXT: s_mul_i32 s12, s0, s5 766; GFX7-NEXT: v_readfirstlane_b32 s13, v1 767; GFX7-NEXT: s_add_u32 s9, s12, s9 768; GFX7-NEXT: s_addc_u32 s10, s13, s10 769; GFX7-NEXT: s_mul_i32 s13, s1, s4 770; GFX7-NEXT: s_cselect_b32 s12, 1, 0 771; GFX7-NEXT: v_readfirstlane_b32 s14, v0 772; GFX7-NEXT: s_add_u32 s9, s13, s9 773; GFX7-NEXT: s_mul_i32 s8, s0, s4 774; GFX7-NEXT: s_addc_u32 s10, s14, s10 775; GFX7-NEXT: s_mul_i32 s0, s0, s7 776; GFX7-NEXT: s_addc_u32 s0, s11, s0 777; GFX7-NEXT: s_mul_i32 s1, s1, s6 778; GFX7-NEXT: s_cmp_lg_u32 s12, 0 779; GFX7-NEXT: s_addc_u32 s0, s0, s1 780; GFX7-NEXT: s_mul_i32 s2, s2, s5 781; GFX7-NEXT: s_add_u32 s0, s2, s0 782; GFX7-NEXT: s_mul_i32 s3, s3, s4 783; GFX7-NEXT: s_add_u32 s3, s3, s0 784; GFX7-NEXT: s_mov_b32 s0, s8 785; GFX7-NEXT: s_mov_b32 s1, s9 786; GFX7-NEXT: s_mov_b32 s2, s10 787; GFX7-NEXT: ; return to shader part epilog 788; 789; GFX8-LABEL: s_mul_i128: 790; GFX8: ; %bb.0: 791; GFX8-NEXT: v_mov_b32_e32 v0, s4 792; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 793; GFX8-NEXT: v_mov_b32_e32 v1, s5 794; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1 795; GFX8-NEXT: s_mul_i32 s10, s0, s6 796; GFX8-NEXT: v_readfirstlane_b32 s9, v0 797; GFX8-NEXT: v_mov_b32_e32 v0, s6 798; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 799; GFX8-NEXT: v_readfirstlane_b32 s13, v2 800; GFX8-NEXT: v_mov_b32_e32 v2, s2 801; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4 802; GFX8-NEXT: s_mul_i32 s12, s1, s5 803; GFX8-NEXT: v_readfirstlane_b32 s11, v0 804; GFX8-NEXT: s_add_u32 s10, s12, s10 805; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 806; GFX8-NEXT: v_mov_b32_e32 v0, s1 807; GFX8-NEXT: s_addc_u32 s11, s13, s11 808; GFX8-NEXT: s_mul_i32 s12, s2, s4 809; GFX8-NEXT: v_readfirstlane_b32 s13, v2 810; GFX8-NEXT: s_add_u32 s10, s12, s10 811; GFX8-NEXT: v_mul_hi_u32 v0, v0, s4 812; GFX8-NEXT: s_addc_u32 s11, s13, s11 813; GFX8-NEXT: 
s_mul_i32 s12, s0, s5 814; GFX8-NEXT: v_readfirstlane_b32 s13, v1 815; GFX8-NEXT: s_add_u32 s9, s12, s9 816; GFX8-NEXT: s_addc_u32 s10, s13, s10 817; GFX8-NEXT: s_mul_i32 s13, s1, s4 818; GFX8-NEXT: s_cselect_b32 s12, 1, 0 819; GFX8-NEXT: v_readfirstlane_b32 s14, v0 820; GFX8-NEXT: s_add_u32 s9, s13, s9 821; GFX8-NEXT: s_mul_i32 s8, s0, s4 822; GFX8-NEXT: s_addc_u32 s10, s14, s10 823; GFX8-NEXT: s_mul_i32 s0, s0, s7 824; GFX8-NEXT: s_addc_u32 s0, s11, s0 825; GFX8-NEXT: s_mul_i32 s1, s1, s6 826; GFX8-NEXT: s_cmp_lg_u32 s12, 0 827; GFX8-NEXT: s_addc_u32 s0, s0, s1 828; GFX8-NEXT: s_mul_i32 s2, s2, s5 829; GFX8-NEXT: s_add_u32 s0, s2, s0 830; GFX8-NEXT: s_mul_i32 s3, s3, s4 831; GFX8-NEXT: s_add_u32 s3, s3, s0 832; GFX8-NEXT: s_mov_b32 s0, s8 833; GFX8-NEXT: s_mov_b32 s1, s9 834; GFX8-NEXT: s_mov_b32 s2, s10 835; GFX8-NEXT: ; return to shader part epilog 836; 837; GFX9-LABEL: s_mul_i128: 838; GFX9: ; %bb.0: 839; GFX9-NEXT: s_mul_i32 s10, s0, s6 840; GFX9-NEXT: s_mul_i32 s12, s1, s5 841; GFX9-NEXT: s_mul_hi_u32 s11, s0, s6 842; GFX9-NEXT: s_mul_hi_u32 s13, s1, s5 843; GFX9-NEXT: s_add_u32 s10, s12, s10 844; GFX9-NEXT: s_addc_u32 s11, s13, s11 845; GFX9-NEXT: s_mul_i32 s12, s2, s4 846; GFX9-NEXT: s_mul_hi_u32 s13, s2, s4 847; GFX9-NEXT: s_add_u32 s10, s12, s10 848; GFX9-NEXT: s_mul_hi_u32 s9, s0, s4 849; GFX9-NEXT: s_addc_u32 s11, s13, s11 850; GFX9-NEXT: s_mul_i32 s12, s0, s5 851; GFX9-NEXT: s_mul_hi_u32 s13, s0, s5 852; GFX9-NEXT: s_add_u32 s9, s12, s9 853; GFX9-NEXT: s_addc_u32 s10, s13, s10 854; GFX9-NEXT: s_mul_i32 s13, s1, s4 855; GFX9-NEXT: s_cselect_b32 s12, 1, 0 856; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4 857; GFX9-NEXT: s_add_u32 s9, s13, s9 858; GFX9-NEXT: s_mul_i32 s8, s0, s4 859; GFX9-NEXT: s_addc_u32 s10, s14, s10 860; GFX9-NEXT: s_mul_i32 s0, s0, s7 861; GFX9-NEXT: s_addc_u32 s0, s11, s0 862; GFX9-NEXT: s_mul_i32 s1, s1, s6 863; GFX9-NEXT: s_cmp_lg_u32 s12, 0 864; GFX9-NEXT: s_addc_u32 s0, s0, s1 865; GFX9-NEXT: s_mul_i32 s2, s2, s5 866; GFX9-NEXT: 
s_add_u32 s0, s2, s0 867; GFX9-NEXT: s_mul_i32 s3, s3, s4 868; GFX9-NEXT: s_add_u32 s3, s3, s0 869; GFX9-NEXT: s_mov_b32 s0, s8 870; GFX9-NEXT: s_mov_b32 s1, s9 871; GFX9-NEXT: s_mov_b32 s2, s10 872; GFX9-NEXT: ; return to shader part epilog 873; 874; GFX10PLUS-LABEL: s_mul_i128: 875; GFX10PLUS: ; %bb.0: 876; GFX10PLUS-NEXT: s_mul_i32 s9, s0, s6 877; GFX10PLUS-NEXT: s_mul_i32 s11, s1, s5 878; GFX10PLUS-NEXT: s_mul_hi_u32 s10, s0, s6 879; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s1, s5 880; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9 881; GFX10PLUS-NEXT: s_mul_i32 s11, s2, s4 882; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10 883; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s2, s4 884; GFX10PLUS-NEXT: s_mul_hi_u32 s8, s0, s4 885; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9 886; GFX10PLUS-NEXT: s_mul_i32 s11, s0, s5 887; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10 888; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s0, s5 889; GFX10PLUS-NEXT: s_add_u32 s8, s11, s8 890; GFX10PLUS-NEXT: s_addc_u32 s9, s12, s9 891; GFX10PLUS-NEXT: s_mul_i32 s12, s1, s4 892; GFX10PLUS-NEXT: s_mul_hi_u32 s13, s1, s4 893; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 894; GFX10PLUS-NEXT: s_add_u32 s8, s12, s8 895; GFX10PLUS-NEXT: s_mul_i32 s12, s0, s7 896; GFX10PLUS-NEXT: s_addc_u32 s7, s13, s9 897; GFX10PLUS-NEXT: s_addc_u32 s9, s10, s12 898; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s6 899; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 900; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s5 901; GFX10PLUS-NEXT: s_addc_u32 s1, s9, s1 902; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s4 903; GFX10PLUS-NEXT: s_add_i32 s1, s1, s2 904; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s4 905; GFX10PLUS-NEXT: s_add_i32 s3, s1, s3 906; GFX10PLUS-NEXT: s_mov_b32 s1, s8 907; GFX10PLUS-NEXT: s_mov_b32 s2, s7 908; GFX10PLUS-NEXT: ; return to shader part epilog 909; 910; GFX12-LABEL: s_mul_i128: 911; GFX12: ; %bb.0: 912; GFX12-NEXT: s_mul_i32 s9, s0, s6 913; GFX12-NEXT: s_mul_i32 s11, s1, s5 914; GFX12-NEXT: s_mul_hi_u32 s10, s0, s6 915; GFX12-NEXT: s_mul_hi_u32 s12, s1, s5 916; GFX12-NEXT: s_add_co_u32 s9, s11, 
s9 917; GFX12-NEXT: s_mul_i32 s11, s2, s4 918; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10 919; GFX12-NEXT: s_mul_hi_u32 s12, s2, s4 920; GFX12-NEXT: s_mul_hi_u32 s8, s0, s4 921; GFX12-NEXT: s_add_co_u32 s9, s11, s9 922; GFX12-NEXT: s_mul_i32 s11, s0, s5 923; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10 924; GFX12-NEXT: s_mul_hi_u32 s12, s0, s5 925; GFX12-NEXT: s_add_co_u32 s8, s11, s8 926; GFX12-NEXT: s_add_co_ci_u32 s9, s12, s9 927; GFX12-NEXT: s_mul_i32 s12, s1, s4 928; GFX12-NEXT: s_mul_hi_u32 s13, s1, s4 929; GFX12-NEXT: s_cselect_b32 s11, 1, 0 930; GFX12-NEXT: s_add_co_u32 s8, s12, s8 931; GFX12-NEXT: s_mul_i32 s12, s0, s7 932; GFX12-NEXT: s_add_co_ci_u32 s7, s13, s9 933; GFX12-NEXT: s_add_co_ci_u32 s9, s10, s12 934; GFX12-NEXT: s_mul_i32 s1, s1, s6 935; GFX12-NEXT: s_cmp_lg_u32 s11, 0 936; GFX12-NEXT: s_mul_i32 s2, s2, s5 937; GFX12-NEXT: s_add_co_ci_u32 s1, s9, s1 938; GFX12-NEXT: s_mul_i32 s3, s3, s4 939; GFX12-NEXT: s_add_co_i32 s1, s1, s2 940; GFX12-NEXT: s_mul_i32 s0, s0, s4 941; GFX12-NEXT: s_add_co_i32 s3, s1, s3 942; GFX12-NEXT: s_mov_b32 s1, s8 943; GFX12-NEXT: s_mov_b32 s2, s7 944; GFX12-NEXT: ; return to shader part epilog 945 %result = mul i128 %num, %den 946 %cast = bitcast i128 %result to <4 x i32> 947 ret <4 x i32> %cast 948} 949 ; Test of an i128 multiply whose operands are passed without inreg and whose result is returned as i128; each check block below expects vector-ALU code (v_mad_u64_u32 / v_mul_lo_u32 chains, v_mad_co_u64_u32 in the last block). ; The check lines are autogenerated (see the NOTE on the first line of this file) - regenerate them with utils/update_llc_test_checks.py instead of editing them by hand. 950define i128 @v_mul_i128(i128 %num, i128 %den) { 951; GFX7-LABEL: v_mul_i128: 952; GFX7: ; %bb.0: 953; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 954; GFX7-NEXT: v_mov_b32_e32 v8, v0 955; GFX7-NEXT: v_mov_b32_e32 v9, v1 956; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 957; GFX7-NEXT: v_mov_b32_e32 v10, v2 958; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 959; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] 960; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 961; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] 962; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6 963; GFX7-NEXT: v_mov_b32_e32 v2, v11 964; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] 965; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], 
v9, v4, v[1:2] 966; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] 967; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc 968; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] 969; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] 970; GFX7-NEXT: s_setpc_b64 s[30:31] 971; 972; GFX8-LABEL: v_mul_i128: 973; GFX8: ; %bb.0: 974; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 975; GFX8-NEXT: v_mov_b32_e32 v8, v0 976; GFX8-NEXT: v_mov_b32_e32 v9, v1 977; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 978; GFX8-NEXT: v_mov_b32_e32 v10, v2 979; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 980; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] 981; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 982; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] 983; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 984; GFX8-NEXT: v_mov_b32_e32 v2, v11 985; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] 986; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] 987; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] 988; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc 989; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] 990; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] 991; GFX8-NEXT: s_setpc_b64 s[30:31] 992; 993; GFX9-LABEL: v_mul_i128: 994; GFX9: ; %bb.0: 995; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 996; GFX9-NEXT: v_mov_b32_e32 v8, v0 997; GFX9-NEXT: v_mov_b32_e32 v9, v1 998; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 999; GFX9-NEXT: v_mov_b32_e32 v10, v2 1000; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 1001; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] 1002; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 1003; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] 1004; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 1005; GFX9-NEXT: v_mov_b32_e32 v2, v11 1006; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] 1007; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] 1008; GFX9-NEXT: v_addc_co_u32_e64 
v7, s[4:5], v12, v7, s[4:5] 1009; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc 1010; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] 1011; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] 1012; GFX9-NEXT: s_setpc_b64 s[30:31] 1013; 1014; GFX10-LABEL: v_mul_i128: 1015; GFX10: ; %bb.0: 1016; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1017; GFX10-NEXT: v_mov_b32_e32 v8, v0 1018; GFX10-NEXT: v_mov_b32_e32 v9, v1 1019; GFX10-NEXT: v_mov_b32_e32 v10, v2 1020; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0 1021; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7 1022; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 1023; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] 1024; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 1025; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] 1026; GFX10-NEXT: v_mov_b32_e32 v2, v11 1027; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] 1028; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] 1029; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 1030; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo 1031; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] 1032; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] 1033; GFX10-NEXT: s_setpc_b64 s[30:31] 1034; 1035; GFX11-LABEL: v_mul_i128: 1036; GFX11: ; %bb.0: 1037; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1038; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 1039; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 1040; GFX11-NEXT: v_mov_b32_e32 v12, v3 1041; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 1042; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 1043; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 1044; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] 1045; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 1046; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] 1047; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] 1048; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] 1049; GFX11-NEXT: 
v_add_co_ci_u32_e64 v3, s0, v3, v6, s0 1050; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo 1051; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] 1052; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] 1053; GFX11-NEXT: s_setpc_b64 s[30:31] 1054; 1055; GFX12-LABEL: v_mul_i128: 1056; GFX12: ; %bb.0: 1057; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 1058; GFX12-NEXT: s_wait_expcnt 0x0 1059; GFX12-NEXT: s_wait_samplecnt 0x0 1060; GFX12-NEXT: s_wait_bvhcnt 0x0 1061; GFX12-NEXT: s_wait_kmcnt 0x0 1062; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 1063; GFX12-NEXT: v_mov_b32_e32 v10, v2 1064; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) 1065; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 1066; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7 1067; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6 1068; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1069; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] 1070; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 1071; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] 1072; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1073; GFX12-NEXT: v_mov_b32_e32 v2, v11 1074; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] 1075; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1076; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] 1077; GFX12-NEXT: v_add_co_ci_u32_e64 v7, s0, v12, v7, s0 1078; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1079; GFX12-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo 1080; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] 1081; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1082; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6] 1083; GFX12-NEXT: s_setpc_b64 s[30:31] 1084 %result = mul i128 %num, %den 1085 ret i128 %result 1086} 1087 1088define 
amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { 1089; GFX7-LABEL: s_mul_i256: 1090; GFX7: ; %bb.0: 1091; GFX7-NEXT: s_mov_b32 s16, s0 1092; GFX7-NEXT: v_mov_b32_e32 v0, s8 1093; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 1094; GFX7-NEXT: v_mov_b32_e32 v1, s9 1095; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1 1096; GFX7-NEXT: v_mul_hi_u32 v1, s16, v1 1097; GFX7-NEXT: v_readfirstlane_b32 s17, v0 1098; GFX7-NEXT: v_mov_b32_e32 v0, s10 1099; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 1100; GFX7-NEXT: v_readfirstlane_b32 s21, v2 1101; GFX7-NEXT: v_mov_b32_e32 v2, s2 1102; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8 1103; GFX7-NEXT: s_mul_i32 s18, s16, s10 1104; GFX7-NEXT: s_mul_i32 s20, s1, s9 1105; GFX7-NEXT: v_readfirstlane_b32 s19, v0 1106; GFX7-NEXT: v_mov_b32_e32 v0, s1 1107; GFX7-NEXT: s_add_u32 s18, s20, s18 1108; GFX7-NEXT: s_addc_u32 s19, s21, s19 1109; GFX7-NEXT: s_mul_i32 s21, s2, s8 1110; GFX7-NEXT: v_readfirstlane_b32 s23, v1 1111; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8 1112; GFX7-NEXT: s_cselect_b32 s20, 1, 0 1113; GFX7-NEXT: v_readfirstlane_b32 s22, v3 1114; GFX7-NEXT: s_add_u32 s18, s21, s18 1115; GFX7-NEXT: s_addc_u32 s19, s22, s19 1116; GFX7-NEXT: s_mul_i32 s22, s16, s9 1117; GFX7-NEXT: s_cselect_b32 s21, 1, 0 1118; GFX7-NEXT: s_add_u32 s17, s22, s17 1119; GFX7-NEXT: s_addc_u32 s22, s23, s18 1120; GFX7-NEXT: v_readfirstlane_b32 s23, v1 1121; GFX7-NEXT: v_mov_b32_e32 v1, s12 1122; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1 1123; GFX7-NEXT: s_mul_i32 s18, s1, s8 1124; GFX7-NEXT: s_cselect_b32 s25, 1, 0 1125; GFX7-NEXT: s_add_u32 s18, s18, s17 1126; GFX7-NEXT: s_addc_u32 s17, s23, s22 1127; GFX7-NEXT: v_mov_b32_e32 v4, s11 1128; GFX7-NEXT: v_readfirstlane_b32 s23, v3 1129; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10 1130; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4 1131; GFX7-NEXT: s_mul_i32 s22, s16, s12 1132; GFX7-NEXT: s_mul_i32 s24, s1, s11 1133; GFX7-NEXT: v_readfirstlane_b32 s28, v3 1134; GFX7-NEXT: v_mov_b32_e32 v3, s3 1135; GFX7-NEXT: v_readfirstlane_b32 s27, v5 1136; GFX7-NEXT: 
v_mul_hi_u32 v5, v3, s9 1137; GFX7-NEXT: s_cselect_b32 s26, 1, 0 1138; GFX7-NEXT: s_add_u32 s24, s24, s22 1139; GFX7-NEXT: s_addc_u32 s23, s27, s23 1140; GFX7-NEXT: v_readfirstlane_b32 s29, v5 1141; GFX7-NEXT: v_mov_b32_e32 v5, s4 1142; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8 1143; GFX7-NEXT: s_mul_i32 s27, s2, s10 1144; GFX7-NEXT: s_cselect_b32 s22, 1, 0 1145; GFX7-NEXT: s_add_u32 s24, s27, s24 1146; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10 1147; GFX7-NEXT: s_addc_u32 s27, s28, s23 1148; GFX7-NEXT: s_mul_i32 s28, s3, s9 1149; GFX7-NEXT: s_cselect_b32 s23, 1, 0 1150; GFX7-NEXT: s_add_u32 s28, s28, s24 1151; GFX7-NEXT: v_readfirstlane_b32 s30, v6 1152; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4 1153; GFX7-NEXT: s_addc_u32 s27, s29, s27 1154; GFX7-NEXT: s_mul_i32 s29, s4, s8 1155; GFX7-NEXT: s_cselect_b32 s24, 1, 0 1156; GFX7-NEXT: s_add_u32 s28, s29, s28 1157; GFX7-NEXT: v_readfirstlane_b32 s33, v0 1158; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9 1159; GFX7-NEXT: s_addc_u32 s27, s30, s27 1160; GFX7-NEXT: s_mul_i32 s30, s16, s11 1161; GFX7-NEXT: s_cselect_b32 s29, 1, 0 1162; GFX7-NEXT: v_readfirstlane_b32 s31, v6 1163; GFX7-NEXT: s_add_u32 s19, s30, s19 1164; GFX7-NEXT: s_addc_u32 s28, s31, s28 1165; GFX7-NEXT: s_mul_i32 s31, s1, s10 1166; GFX7-NEXT: s_cselect_b32 s30, 1, 0 1167; GFX7-NEXT: s_add_u32 s19, s31, s19 1168; GFX7-NEXT: v_readfirstlane_b32 s34, v0 1169; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8 1170; GFX7-NEXT: s_addc_u32 s28, s33, s28 1171; GFX7-NEXT: s_mul_i32 s33, s2, s9 1172; GFX7-NEXT: s_cselect_b32 s31, 1, 0 1173; GFX7-NEXT: s_add_u32 s19, s33, s19 1174; GFX7-NEXT: s_addc_u32 s28, s34, s28 1175; GFX7-NEXT: s_mul_i32 s34, s3, s8 1176; GFX7-NEXT: s_cselect_b32 s33, 1, 0 1177; GFX7-NEXT: v_readfirstlane_b32 s35, v0 1178; GFX7-NEXT: s_add_u32 s19, s34, s19 1179; GFX7-NEXT: v_mov_b32_e32 v0, s14 1180; GFX7-NEXT: s_addc_u32 s28, s35, s28 1181; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0 1182; GFX7-NEXT: s_cselect_b32 s34, 1, 0 1183; GFX7-NEXT: s_cmp_lg_u32 s26, 0 1184; GFX7-NEXT: s_addc_u32 s19, 
s25, s19 1185; GFX7-NEXT: v_mov_b32_e32 v2, s13 1186; GFX7-NEXT: s_cselect_b32 s25, 1, 0 1187; GFX7-NEXT: s_cmp_lg_u32 s21, 0 1188; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2 1189; GFX7-NEXT: s_addc_u32 s20, s20, 0 1190; GFX7-NEXT: v_readfirstlane_b32 s26, v0 1191; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1 1192; GFX7-NEXT: s_cmp_lg_u32 s25, 0 1193; GFX7-NEXT: s_addc_u32 s20, s20, s28 1194; GFX7-NEXT: s_mul_i32 s25, s16, s14 1195; GFX7-NEXT: s_mul_i32 s28, s1, s13 1196; GFX7-NEXT: s_cselect_b32 s21, 1, 0 1197; GFX7-NEXT: v_readfirstlane_b32 s35, v6 1198; GFX7-NEXT: s_add_u32 s25, s28, s25 1199; GFX7-NEXT: s_addc_u32 s26, s35, s26 1200; GFX7-NEXT: v_readfirstlane_b32 s35, v0 1201; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11 1202; GFX7-NEXT: s_mul_i32 s28, s2, s12 1203; GFX7-NEXT: s_add_u32 s25, s28, s25 1204; GFX7-NEXT: s_addc_u32 s26, s35, s26 1205; GFX7-NEXT: v_readfirstlane_b32 s35, v0 1206; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10 1207; GFX7-NEXT: s_mul_i32 s28, s3, s11 1208; GFX7-NEXT: s_add_u32 s25, s28, s25 1209; GFX7-NEXT: s_addc_u32 s26, s35, s26 1210; GFX7-NEXT: v_readfirstlane_b32 s35, v0 1211; GFX7-NEXT: v_mov_b32_e32 v0, s5 1212; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9 1213; GFX7-NEXT: s_mul_i32 s28, s4, s10 1214; GFX7-NEXT: s_add_u32 s25, s28, s25 1215; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1 1216; GFX7-NEXT: s_addc_u32 s26, s35, s26 1217; GFX7-NEXT: v_readfirstlane_b32 s35, v6 1218; GFX7-NEXT: v_mov_b32_e32 v6, s6 1219; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8 1220; GFX7-NEXT: s_mul_i32 s28, s5, s9 1221; GFX7-NEXT: s_add_u32 s25, s28, s25 1222; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2 1223; GFX7-NEXT: v_readfirstlane_b32 s36, v1 1224; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4 1225; GFX7-NEXT: s_addc_u32 s26, s35, s26 1226; GFX7-NEXT: s_mul_i32 s28, s6, s8 1227; GFX7-NEXT: v_readfirstlane_b32 s35, v6 1228; GFX7-NEXT: s_add_u32 s25, s28, s25 1229; GFX7-NEXT: s_addc_u32 s26, s35, s26 1230; GFX7-NEXT: s_mul_i32 s28, s16, s13 1231; GFX7-NEXT: v_readfirstlane_b32 s35, v2 1232; GFX7-NEXT: s_add_u32 s27, s28, s27 
1233; GFX7-NEXT: v_readfirstlane_b32 s37, v1 1234; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10 1235; GFX7-NEXT: s_addc_u32 s25, s35, s25 1236; GFX7-NEXT: s_mul_i32 s35, s1, s12 1237; GFX7-NEXT: s_cselect_b32 s28, 1, 0 1238; GFX7-NEXT: s_add_u32 s27, s35, s27 1239; GFX7-NEXT: s_addc_u32 s25, s36, s25 1240; GFX7-NEXT: s_mul_i32 s36, s2, s11 1241; GFX7-NEXT: s_cselect_b32 s35, 1, 0 1242; GFX7-NEXT: s_add_u32 s27, s36, s27 1243; GFX7-NEXT: v_readfirstlane_b32 s38, v1 1244; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9 1245; GFX7-NEXT: s_addc_u32 s25, s37, s25 1246; GFX7-NEXT: s_mul_i32 s37, s3, s10 1247; GFX7-NEXT: s_cselect_b32 s36, 1, 0 1248; GFX7-NEXT: s_add_u32 s27, s37, s27 1249; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8 1250; GFX7-NEXT: s_addc_u32 s25, s38, s25 1251; GFX7-NEXT: s_mul_i32 s38, s4, s9 1252; GFX7-NEXT: s_cselect_b32 s37, 1, 0 1253; GFX7-NEXT: v_readfirstlane_b32 s39, v1 1254; GFX7-NEXT: s_add_u32 s27, s38, s27 1255; GFX7-NEXT: s_addc_u32 s25, s39, s25 1256; GFX7-NEXT: s_mul_i32 s39, s5, s8 1257; GFX7-NEXT: s_cselect_b32 s38, 1, 0 1258; GFX7-NEXT: v_readfirstlane_b32 s40, v0 1259; GFX7-NEXT: s_add_u32 s27, s39, s27 1260; GFX7-NEXT: s_addc_u32 s25, s40, s25 1261; GFX7-NEXT: s_cselect_b32 s39, 1, 0 1262; GFX7-NEXT: s_cmp_lg_u32 s31, 0 1263; GFX7-NEXT: s_addc_u32 s30, s30, 0 1264; GFX7-NEXT: s_cmp_lg_u32 s33, 0 1265; GFX7-NEXT: s_addc_u32 s30, s30, 0 1266; GFX7-NEXT: s_cmp_lg_u32 s34, 0 1267; GFX7-NEXT: s_addc_u32 s30, s30, 0 1268; GFX7-NEXT: s_cmp_lg_u32 s21, 0 1269; GFX7-NEXT: s_addc_u32 s21, s30, s27 1270; GFX7-NEXT: s_cselect_b32 s27, 1, 0 1271; GFX7-NEXT: s_cmp_lg_u32 s23, 0 1272; GFX7-NEXT: s_addc_u32 s22, s22, 0 1273; GFX7-NEXT: s_cmp_lg_u32 s24, 0 1274; GFX7-NEXT: s_addc_u32 s22, s22, 0 1275; GFX7-NEXT: s_cmp_lg_u32 s29, 0 1276; GFX7-NEXT: s_addc_u32 s22, s22, 0 1277; GFX7-NEXT: s_cmp_lg_u32 s27, 0 1278; GFX7-NEXT: s_addc_u32 s22, s22, s25 1279; GFX7-NEXT: s_mul_i32 s16, s16, s15 1280; GFX7-NEXT: s_addc_u32 s15, s26, s16 1281; GFX7-NEXT: s_mul_i32 s1, s1, s14 1282; 
GFX7-NEXT: s_cmp_lg_u32 s39, 0 1283; GFX7-NEXT: s_addc_u32 s1, s15, s1 1284; GFX7-NEXT: s_mul_i32 s2, s2, s13 1285; GFX7-NEXT: s_cmp_lg_u32 s38, 0 1286; GFX7-NEXT: s_addc_u32 s1, s1, s2 1287; GFX7-NEXT: s_mul_i32 s3, s3, s12 1288; GFX7-NEXT: s_cmp_lg_u32 s37, 0 1289; GFX7-NEXT: s_addc_u32 s1, s1, s3 1290; GFX7-NEXT: s_mul_i32 s4, s4, s11 1291; GFX7-NEXT: s_cmp_lg_u32 s36, 0 1292; GFX7-NEXT: s_addc_u32 s1, s1, s4 1293; GFX7-NEXT: s_mul_i32 s5, s5, s10 1294; GFX7-NEXT: s_cmp_lg_u32 s35, 0 1295; GFX7-NEXT: s_addc_u32 s1, s1, s5 1296; GFX7-NEXT: s_mul_i32 s6, s6, s9 1297; GFX7-NEXT: s_cmp_lg_u32 s28, 0 1298; GFX7-NEXT: s_addc_u32 s1, s1, s6 1299; GFX7-NEXT: s_mul_i32 s7, s7, s8 1300; GFX7-NEXT: s_mul_i32 s0, s0, s8 1301; GFX7-NEXT: s_add_u32 s7, s7, s1 1302; GFX7-NEXT: s_mov_b32 s1, s18 1303; GFX7-NEXT: s_mov_b32 s2, s17 1304; GFX7-NEXT: s_mov_b32 s3, s19 1305; GFX7-NEXT: s_mov_b32 s4, s20 1306; GFX7-NEXT: s_mov_b32 s5, s21 1307; GFX7-NEXT: s_mov_b32 s6, s22 1308; GFX7-NEXT: ; return to shader part epilog 1309; 1310; GFX8-LABEL: s_mul_i256: 1311; GFX8: ; %bb.0: 1312; GFX8-NEXT: s_mov_b32 s16, s0 1313; GFX8-NEXT: v_mov_b32_e32 v0, s8 1314; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 1315; GFX8-NEXT: v_mov_b32_e32 v1, s9 1316; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1 1317; GFX8-NEXT: v_mul_hi_u32 v1, s16, v1 1318; GFX8-NEXT: v_readfirstlane_b32 s17, v0 1319; GFX8-NEXT: v_mov_b32_e32 v0, s10 1320; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 1321; GFX8-NEXT: v_readfirstlane_b32 s21, v2 1322; GFX8-NEXT: v_mov_b32_e32 v2, s2 1323; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8 1324; GFX8-NEXT: s_mul_i32 s18, s16, s10 1325; GFX8-NEXT: s_mul_i32 s20, s1, s9 1326; GFX8-NEXT: v_readfirstlane_b32 s19, v0 1327; GFX8-NEXT: v_mov_b32_e32 v0, s1 1328; GFX8-NEXT: s_add_u32 s18, s20, s18 1329; GFX8-NEXT: s_addc_u32 s19, s21, s19 1330; GFX8-NEXT: s_mul_i32 s21, s2, s8 1331; GFX8-NEXT: v_readfirstlane_b32 s23, v1 1332; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8 1333; GFX8-NEXT: s_cselect_b32 s20, 1, 0 1334; GFX8-NEXT: 
v_readfirstlane_b32 s22, v3 1335; GFX8-NEXT: s_add_u32 s18, s21, s18 1336; GFX8-NEXT: s_addc_u32 s19, s22, s19 1337; GFX8-NEXT: s_mul_i32 s22, s16, s9 1338; GFX8-NEXT: s_cselect_b32 s21, 1, 0 1339; GFX8-NEXT: s_add_u32 s17, s22, s17 1340; GFX8-NEXT: s_addc_u32 s22, s23, s18 1341; GFX8-NEXT: v_readfirstlane_b32 s23, v1 1342; GFX8-NEXT: v_mov_b32_e32 v1, s12 1343; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1 1344; GFX8-NEXT: s_mul_i32 s18, s1, s8 1345; GFX8-NEXT: s_cselect_b32 s25, 1, 0 1346; GFX8-NEXT: s_add_u32 s18, s18, s17 1347; GFX8-NEXT: s_addc_u32 s17, s23, s22 1348; GFX8-NEXT: v_mov_b32_e32 v4, s11 1349; GFX8-NEXT: v_readfirstlane_b32 s23, v3 1350; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10 1351; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4 1352; GFX8-NEXT: s_mul_i32 s22, s16, s12 1353; GFX8-NEXT: s_mul_i32 s24, s1, s11 1354; GFX8-NEXT: v_readfirstlane_b32 s28, v3 1355; GFX8-NEXT: v_mov_b32_e32 v3, s3 1356; GFX8-NEXT: v_readfirstlane_b32 s27, v5 1357; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9 1358; GFX8-NEXT: s_cselect_b32 s26, 1, 0 1359; GFX8-NEXT: s_add_u32 s24, s24, s22 1360; GFX8-NEXT: s_addc_u32 s23, s27, s23 1361; GFX8-NEXT: v_readfirstlane_b32 s29, v5 1362; GFX8-NEXT: v_mov_b32_e32 v5, s4 1363; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8 1364; GFX8-NEXT: s_mul_i32 s27, s2, s10 1365; GFX8-NEXT: s_cselect_b32 s22, 1, 0 1366; GFX8-NEXT: s_add_u32 s24, s27, s24 1367; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10 1368; GFX8-NEXT: s_addc_u32 s27, s28, s23 1369; GFX8-NEXT: s_mul_i32 s28, s3, s9 1370; GFX8-NEXT: s_cselect_b32 s23, 1, 0 1371; GFX8-NEXT: s_add_u32 s28, s28, s24 1372; GFX8-NEXT: v_readfirstlane_b32 s30, v6 1373; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4 1374; GFX8-NEXT: s_addc_u32 s27, s29, s27 1375; GFX8-NEXT: s_mul_i32 s29, s4, s8 1376; GFX8-NEXT: s_cselect_b32 s24, 1, 0 1377; GFX8-NEXT: s_add_u32 s28, s29, s28 1378; GFX8-NEXT: v_readfirstlane_b32 s33, v0 1379; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9 1380; GFX8-NEXT: s_addc_u32 s27, s30, s27 1381; GFX8-NEXT: s_mul_i32 s30, s16, s11 1382; GFX8-NEXT: 
s_cselect_b32 s29, 1, 0 1383; GFX8-NEXT: v_readfirstlane_b32 s31, v6 1384; GFX8-NEXT: s_add_u32 s19, s30, s19 1385; GFX8-NEXT: s_addc_u32 s28, s31, s28 1386; GFX8-NEXT: s_mul_i32 s31, s1, s10 1387; GFX8-NEXT: s_cselect_b32 s30, 1, 0 1388; GFX8-NEXT: s_add_u32 s19, s31, s19 1389; GFX8-NEXT: v_readfirstlane_b32 s34, v0 1390; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8 1391; GFX8-NEXT: s_addc_u32 s28, s33, s28 1392; GFX8-NEXT: s_mul_i32 s33, s2, s9 1393; GFX8-NEXT: s_cselect_b32 s31, 1, 0 1394; GFX8-NEXT: s_add_u32 s19, s33, s19 1395; GFX8-NEXT: s_addc_u32 s28, s34, s28 1396; GFX8-NEXT: s_mul_i32 s34, s3, s8 1397; GFX8-NEXT: s_cselect_b32 s33, 1, 0 1398; GFX8-NEXT: v_readfirstlane_b32 s35, v0 1399; GFX8-NEXT: s_add_u32 s19, s34, s19 1400; GFX8-NEXT: v_mov_b32_e32 v0, s14 1401; GFX8-NEXT: s_addc_u32 s28, s35, s28 1402; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0 1403; GFX8-NEXT: s_cselect_b32 s34, 1, 0 1404; GFX8-NEXT: s_cmp_lg_u32 s26, 0 1405; GFX8-NEXT: s_addc_u32 s19, s25, s19 1406; GFX8-NEXT: v_mov_b32_e32 v2, s13 1407; GFX8-NEXT: s_cselect_b32 s25, 1, 0 1408; GFX8-NEXT: s_cmp_lg_u32 s21, 0 1409; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2 1410; GFX8-NEXT: s_addc_u32 s20, s20, 0 1411; GFX8-NEXT: v_readfirstlane_b32 s26, v0 1412; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1 1413; GFX8-NEXT: s_cmp_lg_u32 s25, 0 1414; GFX8-NEXT: s_addc_u32 s20, s20, s28 1415; GFX8-NEXT: s_mul_i32 s25, s16, s14 1416; GFX8-NEXT: s_mul_i32 s28, s1, s13 1417; GFX8-NEXT: s_cselect_b32 s21, 1, 0 1418; GFX8-NEXT: v_readfirstlane_b32 s35, v6 1419; GFX8-NEXT: s_add_u32 s25, s28, s25 1420; GFX8-NEXT: s_addc_u32 s26, s35, s26 1421; GFX8-NEXT: v_readfirstlane_b32 s35, v0 1422; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11 1423; GFX8-NEXT: s_mul_i32 s28, s2, s12 1424; GFX8-NEXT: s_add_u32 s25, s28, s25 1425; GFX8-NEXT: s_addc_u32 s26, s35, s26 1426; GFX8-NEXT: v_readfirstlane_b32 s35, v0 1427; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10 1428; GFX8-NEXT: s_mul_i32 s28, s3, s11 1429; GFX8-NEXT: s_add_u32 s25, s28, s25 1430; GFX8-NEXT: s_addc_u32 s26, 
s35, s26 1431; GFX8-NEXT: v_readfirstlane_b32 s35, v0 1432; GFX8-NEXT: v_mov_b32_e32 v0, s5 1433; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9 1434; GFX8-NEXT: s_mul_i32 s28, s4, s10 1435; GFX8-NEXT: s_add_u32 s25, s28, s25 1436; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 1437; GFX8-NEXT: s_addc_u32 s26, s35, s26 1438; GFX8-NEXT: v_readfirstlane_b32 s35, v6 1439; GFX8-NEXT: v_mov_b32_e32 v6, s6 1440; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8 1441; GFX8-NEXT: s_mul_i32 s28, s5, s9 1442; GFX8-NEXT: s_add_u32 s25, s28, s25 1443; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2 1444; GFX8-NEXT: v_readfirstlane_b32 s36, v1 1445; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4 1446; GFX8-NEXT: s_addc_u32 s26, s35, s26 1447; GFX8-NEXT: s_mul_i32 s28, s6, s8 1448; GFX8-NEXT: v_readfirstlane_b32 s35, v6 1449; GFX8-NEXT: s_add_u32 s25, s28, s25 1450; GFX8-NEXT: s_addc_u32 s26, s35, s26 1451; GFX8-NEXT: s_mul_i32 s28, s16, s13 1452; GFX8-NEXT: v_readfirstlane_b32 s35, v2 1453; GFX8-NEXT: s_add_u32 s27, s28, s27 1454; GFX8-NEXT: v_readfirstlane_b32 s37, v1 1455; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10 1456; GFX8-NEXT: s_addc_u32 s25, s35, s25 1457; GFX8-NEXT: s_mul_i32 s35, s1, s12 1458; GFX8-NEXT: s_cselect_b32 s28, 1, 0 1459; GFX8-NEXT: s_add_u32 s27, s35, s27 1460; GFX8-NEXT: s_addc_u32 s25, s36, s25 1461; GFX8-NEXT: s_mul_i32 s36, s2, s11 1462; GFX8-NEXT: s_cselect_b32 s35, 1, 0 1463; GFX8-NEXT: s_add_u32 s27, s36, s27 1464; GFX8-NEXT: v_readfirstlane_b32 s38, v1 1465; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9 1466; GFX8-NEXT: s_addc_u32 s25, s37, s25 1467; GFX8-NEXT: s_mul_i32 s37, s3, s10 1468; GFX8-NEXT: s_cselect_b32 s36, 1, 0 1469; GFX8-NEXT: s_add_u32 s27, s37, s27 1470; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8 1471; GFX8-NEXT: s_addc_u32 s25, s38, s25 1472; GFX8-NEXT: s_mul_i32 s38, s4, s9 1473; GFX8-NEXT: s_cselect_b32 s37, 1, 0 1474; GFX8-NEXT: v_readfirstlane_b32 s39, v1 1475; GFX8-NEXT: s_add_u32 s27, s38, s27 1476; GFX8-NEXT: s_addc_u32 s25, s39, s25 1477; GFX8-NEXT: s_mul_i32 s39, s5, s8 1478; GFX8-NEXT: s_cselect_b32 s38, 1, 
0 1479; GFX8-NEXT: v_readfirstlane_b32 s40, v0 1480; GFX8-NEXT: s_add_u32 s27, s39, s27 1481; GFX8-NEXT: s_addc_u32 s25, s40, s25 1482; GFX8-NEXT: s_cselect_b32 s39, 1, 0 1483; GFX8-NEXT: s_cmp_lg_u32 s31, 0 1484; GFX8-NEXT: s_addc_u32 s30, s30, 0 1485; GFX8-NEXT: s_cmp_lg_u32 s33, 0 1486; GFX8-NEXT: s_addc_u32 s30, s30, 0 1487; GFX8-NEXT: s_cmp_lg_u32 s34, 0 1488; GFX8-NEXT: s_addc_u32 s30, s30, 0 1489; GFX8-NEXT: s_cmp_lg_u32 s21, 0 1490; GFX8-NEXT: s_addc_u32 s21, s30, s27 1491; GFX8-NEXT: s_cselect_b32 s27, 1, 0 1492; GFX8-NEXT: s_cmp_lg_u32 s23, 0 1493; GFX8-NEXT: s_addc_u32 s22, s22, 0 1494; GFX8-NEXT: s_cmp_lg_u32 s24, 0 1495; GFX8-NEXT: s_addc_u32 s22, s22, 0 1496; GFX8-NEXT: s_cmp_lg_u32 s29, 0 1497; GFX8-NEXT: s_addc_u32 s22, s22, 0 1498; GFX8-NEXT: s_cmp_lg_u32 s27, 0 1499; GFX8-NEXT: s_addc_u32 s22, s22, s25 1500; GFX8-NEXT: s_mul_i32 s16, s16, s15 1501; GFX8-NEXT: s_addc_u32 s15, s26, s16 1502; GFX8-NEXT: s_mul_i32 s1, s1, s14 1503; GFX8-NEXT: s_cmp_lg_u32 s39, 0 1504; GFX8-NEXT: s_addc_u32 s1, s15, s1 1505; GFX8-NEXT: s_mul_i32 s2, s2, s13 1506; GFX8-NEXT: s_cmp_lg_u32 s38, 0 1507; GFX8-NEXT: s_addc_u32 s1, s1, s2 1508; GFX8-NEXT: s_mul_i32 s3, s3, s12 1509; GFX8-NEXT: s_cmp_lg_u32 s37, 0 1510; GFX8-NEXT: s_addc_u32 s1, s1, s3 1511; GFX8-NEXT: s_mul_i32 s4, s4, s11 1512; GFX8-NEXT: s_cmp_lg_u32 s36, 0 1513; GFX8-NEXT: s_addc_u32 s1, s1, s4 1514; GFX8-NEXT: s_mul_i32 s5, s5, s10 1515; GFX8-NEXT: s_cmp_lg_u32 s35, 0 1516; GFX8-NEXT: s_addc_u32 s1, s1, s5 1517; GFX8-NEXT: s_mul_i32 s6, s6, s9 1518; GFX8-NEXT: s_cmp_lg_u32 s28, 0 1519; GFX8-NEXT: s_addc_u32 s1, s1, s6 1520; GFX8-NEXT: s_mul_i32 s7, s7, s8 1521; GFX8-NEXT: s_mul_i32 s0, s0, s8 1522; GFX8-NEXT: s_add_u32 s7, s7, s1 1523; GFX8-NEXT: s_mov_b32 s1, s18 1524; GFX8-NEXT: s_mov_b32 s2, s17 1525; GFX8-NEXT: s_mov_b32 s3, s19 1526; GFX8-NEXT: s_mov_b32 s4, s20 1527; GFX8-NEXT: s_mov_b32 s5, s21 1528; GFX8-NEXT: s_mov_b32 s6, s22 1529; GFX8-NEXT: ; return to shader part epilog 1530; 1531; 
GFX9-LABEL: s_mul_i256: 1532; GFX9: ; %bb.0: 1533; GFX9-NEXT: s_mov_b32 s16, s0 1534; GFX9-NEXT: s_mul_i32 s18, s16, s10 1535; GFX9-NEXT: s_mul_i32 s20, s1, s9 1536; GFX9-NEXT: s_mul_hi_u32 s19, s16, s10 1537; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9 1538; GFX9-NEXT: s_add_u32 s18, s20, s18 1539; GFX9-NEXT: s_addc_u32 s19, s21, s19 1540; GFX9-NEXT: s_mul_i32 s21, s2, s8 1541; GFX9-NEXT: s_cselect_b32 s20, 1, 0 1542; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8 1543; GFX9-NEXT: s_add_u32 s18, s21, s18 1544; GFX9-NEXT: s_mul_hi_u32 s17, s16, s8 1545; GFX9-NEXT: s_addc_u32 s19, s22, s19 1546; GFX9-NEXT: s_mul_i32 s22, s16, s9 1547; GFX9-NEXT: s_cselect_b32 s21, 1, 0 1548; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9 1549; GFX9-NEXT: s_add_u32 s17, s22, s17 1550; GFX9-NEXT: s_addc_u32 s18, s23, s18 1551; GFX9-NEXT: s_mul_i32 s23, s1, s8 1552; GFX9-NEXT: s_cselect_b32 s22, 1, 0 1553; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8 1554; GFX9-NEXT: s_add_u32 s17, s23, s17 1555; GFX9-NEXT: s_addc_u32 s18, s24, s18 1556; GFX9-NEXT: s_mul_i32 s24, s16, s12 1557; GFX9-NEXT: s_mul_i32 s26, s1, s11 1558; GFX9-NEXT: s_cselect_b32 s23, 1, 0 1559; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12 1560; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11 1561; GFX9-NEXT: s_add_u32 s24, s26, s24 1562; GFX9-NEXT: s_addc_u32 s25, s27, s25 1563; GFX9-NEXT: s_mul_i32 s27, s2, s10 1564; GFX9-NEXT: s_cselect_b32 s26, 1, 0 1565; GFX9-NEXT: s_mul_hi_u32 s28, s2, s10 1566; GFX9-NEXT: s_add_u32 s24, s27, s24 1567; GFX9-NEXT: s_addc_u32 s25, s28, s25 1568; GFX9-NEXT: s_mul_i32 s28, s3, s9 1569; GFX9-NEXT: s_cselect_b32 s27, 1, 0 1570; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9 1571; GFX9-NEXT: s_add_u32 s24, s28, s24 1572; GFX9-NEXT: s_addc_u32 s25, s29, s25 1573; GFX9-NEXT: s_mul_i32 s29, s4, s8 1574; GFX9-NEXT: s_cselect_b32 s28, 1, 0 1575; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8 1576; GFX9-NEXT: s_add_u32 s24, s29, s24 1577; GFX9-NEXT: s_addc_u32 s25, s30, s25 1578; GFX9-NEXT: s_mul_i32 s30, s16, s11 1579; GFX9-NEXT: s_cselect_b32 s29, 1, 0 1580; GFX9-NEXT: 
s_mul_hi_u32 s31, s16, s11 1581; GFX9-NEXT: s_add_u32 s19, s30, s19 1582; GFX9-NEXT: s_addc_u32 s24, s31, s24 1583; GFX9-NEXT: s_mul_i32 s31, s1, s10 1584; GFX9-NEXT: s_cselect_b32 s30, 1, 0 1585; GFX9-NEXT: s_mul_hi_u32 s33, s1, s10 1586; GFX9-NEXT: s_add_u32 s19, s31, s19 1587; GFX9-NEXT: s_addc_u32 s24, s33, s24 1588; GFX9-NEXT: s_mul_i32 s33, s2, s9 1589; GFX9-NEXT: s_cselect_b32 s31, 1, 0 1590; GFX9-NEXT: s_mul_hi_u32 s34, s2, s9 1591; GFX9-NEXT: s_add_u32 s19, s33, s19 1592; GFX9-NEXT: s_addc_u32 s24, s34, s24 1593; GFX9-NEXT: s_mul_i32 s34, s3, s8 1594; GFX9-NEXT: s_cselect_b32 s33, 1, 0 1595; GFX9-NEXT: s_mul_hi_u32 s35, s3, s8 1596; GFX9-NEXT: s_add_u32 s19, s34, s19 1597; GFX9-NEXT: s_addc_u32 s24, s35, s24 1598; GFX9-NEXT: s_cselect_b32 s34, 1, 0 1599; GFX9-NEXT: s_cmp_lg_u32 s23, 0 1600; GFX9-NEXT: s_addc_u32 s19, s22, s19 1601; GFX9-NEXT: s_cselect_b32 s22, 1, 0 1602; GFX9-NEXT: s_cmp_lg_u32 s21, 0 1603; GFX9-NEXT: s_addc_u32 s20, s20, 0 1604; GFX9-NEXT: s_cmp_lg_u32 s22, 0 1605; GFX9-NEXT: s_addc_u32 s20, s20, s24 1606; GFX9-NEXT: s_mul_i32 s22, s16, s14 1607; GFX9-NEXT: s_mul_i32 s24, s1, s13 1608; GFX9-NEXT: s_cselect_b32 s21, 1, 0 1609; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14 1610; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13 1611; GFX9-NEXT: s_add_u32 s22, s24, s22 1612; GFX9-NEXT: s_addc_u32 s23, s35, s23 1613; GFX9-NEXT: s_mul_i32 s24, s2, s12 1614; GFX9-NEXT: s_mul_hi_u32 s35, s2, s12 1615; GFX9-NEXT: s_add_u32 s22, s24, s22 1616; GFX9-NEXT: s_addc_u32 s23, s35, s23 1617; GFX9-NEXT: s_mul_i32 s24, s3, s11 1618; GFX9-NEXT: s_mul_hi_u32 s35, s3, s11 1619; GFX9-NEXT: s_add_u32 s22, s24, s22 1620; GFX9-NEXT: s_addc_u32 s23, s35, s23 1621; GFX9-NEXT: s_mul_i32 s24, s4, s10 1622; GFX9-NEXT: s_mul_hi_u32 s35, s4, s10 1623; GFX9-NEXT: s_add_u32 s22, s24, s22 1624; GFX9-NEXT: s_addc_u32 s23, s35, s23 1625; GFX9-NEXT: s_mul_i32 s24, s5, s9 1626; GFX9-NEXT: s_mul_hi_u32 s35, s5, s9 1627; GFX9-NEXT: s_add_u32 s22, s24, s22 1628; GFX9-NEXT: s_addc_u32 s23, s35, s23 
1629; GFX9-NEXT: s_mul_i32 s24, s6, s8 1630; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8 1631; GFX9-NEXT: s_add_u32 s22, s24, s22 1632; GFX9-NEXT: s_addc_u32 s23, s35, s23 1633; GFX9-NEXT: s_mul_i32 s24, s16, s13 1634; GFX9-NEXT: s_mul_hi_u32 s35, s16, s13 1635; GFX9-NEXT: s_add_u32 s24, s24, s25 1636; GFX9-NEXT: s_addc_u32 s22, s35, s22 1637; GFX9-NEXT: s_mul_i32 s35, s1, s12 1638; GFX9-NEXT: s_cselect_b32 s25, 1, 0 1639; GFX9-NEXT: s_mul_hi_u32 s36, s1, s12 1640; GFX9-NEXT: s_add_u32 s24, s35, s24 1641; GFX9-NEXT: s_addc_u32 s22, s36, s22 1642; GFX9-NEXT: s_mul_i32 s36, s2, s11 1643; GFX9-NEXT: s_cselect_b32 s35, 1, 0 1644; GFX9-NEXT: s_mul_hi_u32 s37, s2, s11 1645; GFX9-NEXT: s_add_u32 s24, s36, s24 1646; GFX9-NEXT: s_addc_u32 s22, s37, s22 1647; GFX9-NEXT: s_mul_i32 s37, s3, s10 1648; GFX9-NEXT: s_cselect_b32 s36, 1, 0 1649; GFX9-NEXT: s_mul_hi_u32 s38, s3, s10 1650; GFX9-NEXT: s_add_u32 s24, s37, s24 1651; GFX9-NEXT: s_addc_u32 s22, s38, s22 1652; GFX9-NEXT: s_mul_i32 s38, s4, s9 1653; GFX9-NEXT: s_cselect_b32 s37, 1, 0 1654; GFX9-NEXT: s_mul_hi_u32 s39, s4, s9 1655; GFX9-NEXT: s_add_u32 s24, s38, s24 1656; GFX9-NEXT: s_addc_u32 s22, s39, s22 1657; GFX9-NEXT: s_mul_i32 s39, s5, s8 1658; GFX9-NEXT: s_cselect_b32 s38, 1, 0 1659; GFX9-NEXT: s_mul_hi_u32 s40, s5, s8 1660; GFX9-NEXT: s_add_u32 s24, s39, s24 1661; GFX9-NEXT: s_addc_u32 s22, s40, s22 1662; GFX9-NEXT: s_cselect_b32 s39, 1, 0 1663; GFX9-NEXT: s_cmp_lg_u32 s31, 0 1664; GFX9-NEXT: s_addc_u32 s30, s30, 0 1665; GFX9-NEXT: s_cmp_lg_u32 s33, 0 1666; GFX9-NEXT: s_addc_u32 s30, s30, 0 1667; GFX9-NEXT: s_cmp_lg_u32 s34, 0 1668; GFX9-NEXT: s_addc_u32 s30, s30, 0 1669; GFX9-NEXT: s_cmp_lg_u32 s21, 0 1670; GFX9-NEXT: s_addc_u32 s21, s30, s24 1671; GFX9-NEXT: s_cselect_b32 s24, 1, 0 1672; GFX9-NEXT: s_cmp_lg_u32 s27, 0 1673; GFX9-NEXT: s_addc_u32 s26, s26, 0 1674; GFX9-NEXT: s_cmp_lg_u32 s28, 0 1675; GFX9-NEXT: s_addc_u32 s26, s26, 0 1676; GFX9-NEXT: s_cmp_lg_u32 s29, 0 1677; GFX9-NEXT: s_addc_u32 s26, s26, 0 1678; 
GFX9-NEXT: s_cmp_lg_u32 s24, 0 1679; GFX9-NEXT: s_addc_u32 s22, s26, s22 1680; GFX9-NEXT: s_mul_i32 s16, s16, s15 1681; GFX9-NEXT: s_addc_u32 s15, s23, s16 1682; GFX9-NEXT: s_mul_i32 s1, s1, s14 1683; GFX9-NEXT: s_cmp_lg_u32 s39, 0 1684; GFX9-NEXT: s_addc_u32 s1, s15, s1 1685; GFX9-NEXT: s_mul_i32 s2, s2, s13 1686; GFX9-NEXT: s_cmp_lg_u32 s38, 0 1687; GFX9-NEXT: s_addc_u32 s1, s1, s2 1688; GFX9-NEXT: s_mul_i32 s3, s3, s12 1689; GFX9-NEXT: s_cmp_lg_u32 s37, 0 1690; GFX9-NEXT: s_addc_u32 s1, s1, s3 1691; GFX9-NEXT: s_mul_i32 s4, s4, s11 1692; GFX9-NEXT: s_cmp_lg_u32 s36, 0 1693; GFX9-NEXT: s_addc_u32 s1, s1, s4 1694; GFX9-NEXT: s_mul_i32 s5, s5, s10 1695; GFX9-NEXT: s_cmp_lg_u32 s35, 0 1696; GFX9-NEXT: s_addc_u32 s1, s1, s5 1697; GFX9-NEXT: s_mul_i32 s6, s6, s9 1698; GFX9-NEXT: s_cmp_lg_u32 s25, 0 1699; GFX9-NEXT: s_addc_u32 s1, s1, s6 1700; GFX9-NEXT: s_mul_i32 s7, s7, s8 1701; GFX9-NEXT: s_mul_i32 s0, s0, s8 1702; GFX9-NEXT: s_add_u32 s7, s7, s1 1703; GFX9-NEXT: s_mov_b32 s1, s17 1704; GFX9-NEXT: s_mov_b32 s2, s18 1705; GFX9-NEXT: s_mov_b32 s3, s19 1706; GFX9-NEXT: s_mov_b32 s4, s20 1707; GFX9-NEXT: s_mov_b32 s5, s21 1708; GFX9-NEXT: s_mov_b32 s6, s22 1709; GFX9-NEXT: ; return to shader part epilog 1710; 1711; GFX10PLUS-LABEL: s_mul_i256: 1712; GFX10PLUS: ; %bb.0: 1713; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10 1714; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9 1715; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10 1716; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9 1717; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17 1718; GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18 1719; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8 1720; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8 1721; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0 1722; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17 1723; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8 1724; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 1725; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9 1726; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9 1727; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 1728; GFX10PLUS-NEXT: 
s_add_u32 s16, s21, s16 1729; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17 1730; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8 1731; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8 1732; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 1733; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16 1734; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17 1735; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12 1736; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11 1737; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12 1738; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11 1739; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0 1740; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23 1741; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24 1742; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10 1743; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10 1744; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0 1745; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23 1746; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24 1747; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9 1748; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9 1749; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0 1750; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23 1751; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24 1752; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8 1753; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8 1754; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0 1755; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23 1756; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24 1757; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11 1758; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11 1759; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0 1760; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18 1761; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23 1762; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10 1763; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10 1764; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0 1765; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18 1766; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23 1767; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9 1768; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9 1769; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0 1770; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18 1771; GFX10PLUS-NEXT: s_addc_u32 
s23, s33, s23 1772; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8 1773; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8 1774; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0 1775; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18 1776; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23 1777; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0 1778; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0 1779; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14 1780; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18 1781; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0 1782; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 1783; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13 1784; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0 1785; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0 1786; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14 1787; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23 1788; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13 1789; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 1790; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 1791; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12 1792; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 1793; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12 1794; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 1795; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11 1796; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 1797; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11 1798; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 1799; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10 1800; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 1801; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10 1802; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 1803; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9 1804; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 1805; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9 1806; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 1807; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8 1808; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 1809; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8 1810; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21 1811; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13 1812; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22 1813; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13 1814; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24 1815; 
GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21 1816; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12 1817; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12 1818; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0 1819; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23 1820; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21 1821; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11 1822; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11 1823; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0 1824; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23 1825; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21 1826; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10 1827; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10 1828; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0 1829; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23 1830; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21 1831; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9 1832; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9 1833; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0 1834; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23 1835; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21 1836; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8 1837; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8 1838; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0 1839; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23 1840; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21 1841; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0 1842; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0 1843; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14 1844; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 1845; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0 1846; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13 1847; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 1848; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0 1849; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12 1850; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0 1851; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 1852; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11 1853; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23 1854; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0 1855; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0 1856; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15 1857; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 1858; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0 1859; 
GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10 1860; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 1861; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0 1862; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9 1863; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0 1864; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0 1865; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8 1866; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21 1867; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26 1868; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0 1869; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8 1870; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1 1871; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0 1872; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2 1873; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0 1874; GFX10PLUS-NEXT: s_mov_b32 s2, s17 1875; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3 1876; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0 1877; GFX10PLUS-NEXT: s_mov_b32 s3, s18 1878; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4 1879; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0 1880; GFX10PLUS-NEXT: s_mov_b32 s4, s19 1881; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5 1882; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0 1883; GFX10PLUS-NEXT: s_mov_b32 s5, s20 1884; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6 1885; GFX10PLUS-NEXT: s_mov_b32 s6, s15 1886; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7 1887; GFX10PLUS-NEXT: s_mov_b32 s1, s16 1888; GFX10PLUS-NEXT: ; return to shader part epilog 1889; 1890; GFX12-LABEL: s_mul_i256: 1891; GFX12: ; %bb.0: 1892; GFX12-NEXT: s_mul_i32 s17, s0, s10 1893; GFX12-NEXT: s_mul_i32 s19, s1, s9 1894; GFX12-NEXT: s_mul_hi_u32 s18, s0, s10 1895; GFX12-NEXT: s_mul_hi_u32 s20, s1, s9 1896; GFX12-NEXT: s_add_co_u32 s17, s19, s17 1897; GFX12-NEXT: s_add_co_ci_u32 s18, s20, s18 1898; GFX12-NEXT: s_mul_i32 s20, s2, s8 1899; GFX12-NEXT: s_mul_hi_u32 s21, s2, s8 1900; GFX12-NEXT: s_cselect_b32 s19, 1, 0 1901; GFX12-NEXT: s_add_co_u32 s17, s20, s17 1902; GFX12-NEXT: s_mul_hi_u32 s16, s0, s8 1903; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 1904; GFX12-NEXT: s_mul_i32 s21, s0, s9 1905; GFX12-NEXT: s_mul_hi_u32 s22, s0, s9 1906; GFX12-NEXT: s_cselect_b32 s20, 1, 0 1907; 
GFX12-NEXT: s_add_co_u32 s16, s21, s16 1908; GFX12-NEXT: s_add_co_ci_u32 s17, s22, s17 1909; GFX12-NEXT: s_mul_i32 s22, s1, s8 1910; GFX12-NEXT: s_mul_hi_u32 s23, s1, s8 1911; GFX12-NEXT: s_cselect_b32 s21, 1, 0 1912; GFX12-NEXT: s_add_co_u32 s16, s22, s16 1913; GFX12-NEXT: s_add_co_ci_u32 s17, s23, s17 1914; GFX12-NEXT: s_mul_i32 s23, s0, s12 1915; GFX12-NEXT: s_mul_i32 s25, s1, s11 1916; GFX12-NEXT: s_mul_hi_u32 s24, s0, s12 1917; GFX12-NEXT: s_mul_hi_u32 s26, s1, s11 1918; GFX12-NEXT: s_cselect_b32 s22, 1, 0 1919; GFX12-NEXT: s_add_co_u32 s23, s25, s23 1920; GFX12-NEXT: s_add_co_ci_u32 s24, s26, s24 1921; GFX12-NEXT: s_mul_i32 s26, s2, s10 1922; GFX12-NEXT: s_mul_hi_u32 s27, s2, s10 1923; GFX12-NEXT: s_cselect_b32 s25, 1, 0 1924; GFX12-NEXT: s_add_co_u32 s23, s26, s23 1925; GFX12-NEXT: s_add_co_ci_u32 s24, s27, s24 1926; GFX12-NEXT: s_mul_i32 s27, s3, s9 1927; GFX12-NEXT: s_mul_hi_u32 s28, s3, s9 1928; GFX12-NEXT: s_cselect_b32 s26, 1, 0 1929; GFX12-NEXT: s_add_co_u32 s23, s27, s23 1930; GFX12-NEXT: s_add_co_ci_u32 s24, s28, s24 1931; GFX12-NEXT: s_mul_i32 s28, s4, s8 1932; GFX12-NEXT: s_mul_hi_u32 s29, s4, s8 1933; GFX12-NEXT: s_cselect_b32 s27, 1, 0 1934; GFX12-NEXT: s_add_co_u32 s23, s28, s23 1935; GFX12-NEXT: s_add_co_ci_u32 s24, s29, s24 1936; GFX12-NEXT: s_mul_i32 s29, s0, s11 1937; GFX12-NEXT: s_mul_hi_u32 s30, s0, s11 1938; GFX12-NEXT: s_cselect_b32 s28, 1, 0 1939; GFX12-NEXT: s_add_co_u32 s18, s29, s18 1940; GFX12-NEXT: s_add_co_ci_u32 s23, s30, s23 1941; GFX12-NEXT: s_mul_i32 s30, s1, s10 1942; GFX12-NEXT: s_mul_hi_u32 s31, s1, s10 1943; GFX12-NEXT: s_cselect_b32 s29, 1, 0 1944; GFX12-NEXT: s_add_co_u32 s18, s30, s18 1945; GFX12-NEXT: s_add_co_ci_u32 s23, s31, s23 1946; GFX12-NEXT: s_mul_i32 s31, s2, s9 1947; GFX12-NEXT: s_mul_hi_u32 s33, s2, s9 1948; GFX12-NEXT: s_cselect_b32 s30, 1, 0 1949; GFX12-NEXT: s_add_co_u32 s18, s31, s18 1950; GFX12-NEXT: s_add_co_ci_u32 s23, s33, s23 1951; GFX12-NEXT: s_mul_i32 s33, s3, s8 1952; GFX12-NEXT: s_mul_hi_u32 s34, 
s3, s8 1953; GFX12-NEXT: s_cselect_b32 s31, 1, 0 1954; GFX12-NEXT: s_add_co_u32 s18, s33, s18 1955; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23 1956; GFX12-NEXT: s_cselect_b32 s33, 1, 0 1957; GFX12-NEXT: s_cmp_lg_u32 s22, 0 1958; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14 1959; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18 1960; GFX12-NEXT: s_cselect_b32 s21, 1, 0 1961; GFX12-NEXT: s_cmp_lg_u32 s20, 0 1962; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13 1963; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0 1964; GFX12-NEXT: s_cmp_lg_u32 s21, 0 1965; GFX12-NEXT: s_mul_i32 s21, s0, s14 1966; GFX12-NEXT: s_add_co_ci_u32 s19, s19, s23 1967; GFX12-NEXT: s_mul_i32 s23, s1, s13 1968; GFX12-NEXT: s_cselect_b32 s20, 1, 0 1969; GFX12-NEXT: s_add_co_u32 s21, s23, s21 1970; GFX12-NEXT: s_mul_i32 s23, s2, s12 1971; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 1972; GFX12-NEXT: s_mul_hi_u32 s34, s2, s12 1973; GFX12-NEXT: s_add_co_u32 s21, s23, s21 1974; GFX12-NEXT: s_mul_i32 s23, s3, s11 1975; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 1976; GFX12-NEXT: s_mul_hi_u32 s34, s3, s11 1977; GFX12-NEXT: s_add_co_u32 s21, s23, s21 1978; GFX12-NEXT: s_mul_i32 s23, s4, s10 1979; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 1980; GFX12-NEXT: s_mul_hi_u32 s34, s4, s10 1981; GFX12-NEXT: s_add_co_u32 s21, s23, s21 1982; GFX12-NEXT: s_mul_i32 s23, s5, s9 1983; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 1984; GFX12-NEXT: s_mul_hi_u32 s34, s5, s9 1985; GFX12-NEXT: s_add_co_u32 s21, s23, s21 1986; GFX12-NEXT: s_mul_i32 s23, s6, s8 1987; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 1988; GFX12-NEXT: s_mul_hi_u32 s34, s6, s8 1989; GFX12-NEXT: s_add_co_u32 s21, s23, s21 1990; GFX12-NEXT: s_mul_i32 s23, s0, s13 1991; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22 1992; GFX12-NEXT: s_mul_hi_u32 s34, s0, s13 1993; GFX12-NEXT: s_add_co_u32 s23, s23, s24 1994; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21 1995; GFX12-NEXT: s_mul_i32 s34, s1, s12 1996; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12 1997; GFX12-NEXT: s_cselect_b32 s24, 1, 0 1998; GFX12-NEXT: 
s_add_co_u32 s23, s34, s23 1999; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21 2000; GFX12-NEXT: s_mul_i32 s35, s2, s11 2001; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11 2002; GFX12-NEXT: s_cselect_b32 s34, 1, 0 2003; GFX12-NEXT: s_add_co_u32 s23, s35, s23 2004; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21 2005; GFX12-NEXT: s_mul_i32 s36, s3, s10 2006; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10 2007; GFX12-NEXT: s_cselect_b32 s35, 1, 0 2008; GFX12-NEXT: s_add_co_u32 s23, s36, s23 2009; GFX12-NEXT: s_add_co_ci_u32 s21, s37, s21 2010; GFX12-NEXT: s_mul_i32 s37, s4, s9 2011; GFX12-NEXT: s_mul_hi_u32 s38, s4, s9 2012; GFX12-NEXT: s_cselect_b32 s36, 1, 0 2013; GFX12-NEXT: s_add_co_u32 s23, s37, s23 2014; GFX12-NEXT: s_add_co_ci_u32 s21, s38, s21 2015; GFX12-NEXT: s_mul_i32 s38, s5, s8 2016; GFX12-NEXT: s_mul_hi_u32 s39, s5, s8 2017; GFX12-NEXT: s_cselect_b32 s37, 1, 0 2018; GFX12-NEXT: s_add_co_u32 s23, s38, s23 2019; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21 2020; GFX12-NEXT: s_cselect_b32 s38, 1, 0 2021; GFX12-NEXT: s_cmp_lg_u32 s30, 0 2022; GFX12-NEXT: s_mul_i32 s1, s1, s14 2023; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 2024; GFX12-NEXT: s_cmp_lg_u32 s31, 0 2025; GFX12-NEXT: s_mul_i32 s2, s2, s13 2026; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 2027; GFX12-NEXT: s_cmp_lg_u32 s33, 0 2028; GFX12-NEXT: s_mul_i32 s3, s3, s12 2029; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0 2030; GFX12-NEXT: s_cmp_lg_u32 s20, 0 2031; GFX12-NEXT: s_mul_i32 s4, s4, s11 2032; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23 2033; GFX12-NEXT: s_cselect_b32 s23, 1, 0 2034; GFX12-NEXT: s_cmp_lg_u32 s26, 0 2035; GFX12-NEXT: s_mul_i32 s26, s0, s15 2036; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 2037; GFX12-NEXT: s_cmp_lg_u32 s27, 0 2038; GFX12-NEXT: s_mul_i32 s5, s5, s10 2039; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 2040; GFX12-NEXT: s_cmp_lg_u32 s28, 0 2041; GFX12-NEXT: s_mul_i32 s6, s6, s9 2042; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0 2043; GFX12-NEXT: s_cmp_lg_u32 s23, 0 2044; GFX12-NEXT: s_mul_i32 s7, s7, s8 2045; GFX12-NEXT: 
s_add_co_ci_u32 s15, s25, s21 2046; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26 2047; GFX12-NEXT: s_cmp_lg_u32 s38, 0 2048; GFX12-NEXT: s_mul_i32 s0, s0, s8 2049; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1 2050; GFX12-NEXT: s_cmp_lg_u32 s37, 0 2051; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2 2052; GFX12-NEXT: s_cmp_lg_u32 s36, 0 2053; GFX12-NEXT: s_mov_b32 s2, s17 2054; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3 2055; GFX12-NEXT: s_cmp_lg_u32 s35, 0 2056; GFX12-NEXT: s_mov_b32 s3, s18 2057; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s4 2058; GFX12-NEXT: s_cmp_lg_u32 s34, 0 2059; GFX12-NEXT: s_mov_b32 s4, s19 2060; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s5 2061; GFX12-NEXT: s_cmp_lg_u32 s24, 0 2062; GFX12-NEXT: s_mov_b32 s5, s20 2063; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s6 2064; GFX12-NEXT: s_mov_b32 s6, s15 2065; GFX12-NEXT: s_add_co_i32 s7, s1, s7 2066; GFX12-NEXT: s_mov_b32 s1, s16 2067; GFX12-NEXT: ; return to shader part epilog 2068 %result = mul i256 %num, %den 2069 %cast = bitcast i256 %result to <8 x i32> 2070 ret <8 x i32> %cast 2071} 2072 2073define i256 @v_mul_i256(i256 %num, i256 %den) { 2074; GFX7-LABEL: v_mul_i256: 2075; GFX7: ; %bb.0: 2076; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2077; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 2078; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 2079; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] 2080; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11 2081; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 2082; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] 2083; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] 2084; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] 2085; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] 2086; GFX7-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] 2087; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] 2088; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] 2089; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc 2090; GFX7-NEXT: 
v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] 2091; GFX7-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc 2092; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 2093; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] 2094; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] 2095; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] 2096; GFX7-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc 2097; GFX7-NEXT: v_mov_b32_e32 v20, v18 2098; GFX7-NEXT: v_mov_b32_e32 v18, v19 2099; GFX7-NEXT: v_mov_b32_e32 v19, v16 2100; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] 2101; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 2102; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] 2103; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] 2104; GFX7-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] 2105; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] 2106; GFX7-NEXT: v_mov_b32_e32 v19, v22 2107; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] 2108; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] 2109; GFX7-NEXT: v_mul_lo_u32 v24, v3, v12 2110; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] 2111; GFX7-NEXT: v_mul_lo_u32 v22, v2, v13 2112; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] 2113; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] 2114; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] 2115; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] 2116; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] 2117; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 2118; GFX7-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] 2119; GFX7-NEXT: v_mov_b32_e32 v20, v11 2120; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] 2121; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] 2122; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] 2123; GFX7-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] 2124; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 
2125; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] 2126; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] 2127; GFX7-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] 2128; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 2129; GFX7-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] 2130; GFX7-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] 2131; GFX7-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] 2132; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] 2133; GFX7-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] 2134; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] 2135; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] 2136; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] 2137; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] 2138; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc 2139; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] 2140; GFX7-NEXT: v_mov_b32_e32 v0, v10 2141; GFX7-NEXT: s_setpc_b64 s[30:31] 2142; 2143; GFX8-LABEL: v_mul_i256: 2144; GFX8: ; %bb.0: 2145; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2146; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 2147; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 2148; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] 2149; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11 2150; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 2151; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] 2152; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] 2153; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] 2154; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] 2155; GFX8-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] 2156; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] 2157; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] 2158; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc 2159; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] 2160; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc 2161; 
GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 2162; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] 2163; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] 2164; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] 2165; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v20, vcc 2166; GFX8-NEXT: v_mov_b32_e32 v20, v18 2167; GFX8-NEXT: v_mov_b32_e32 v18, v19 2168; GFX8-NEXT: v_mov_b32_e32 v19, v16 2169; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] 2170; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 2171; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] 2172; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] 2173; GFX8-NEXT: v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5] 2174; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] 2175; GFX8-NEXT: v_mov_b32_e32 v19, v22 2176; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] 2177; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] 2178; GFX8-NEXT: v_mul_lo_u32 v24, v3, v12 2179; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] 2180; GFX8-NEXT: v_mul_lo_u32 v22, v2, v13 2181; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] 2182; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] 2183; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] 2184; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13] 2185; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] 2186; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 2187; GFX8-NEXT: v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13] 2188; GFX8-NEXT: v_mov_b32_e32 v20, v11 2189; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] 2190; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] 2191; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] 2192; GFX8-NEXT: v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13] 2193; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 2194; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] 2195; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, 
v8, v[20:21] 2196; GFX8-NEXT: v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13] 2197; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 2198; GFX8-NEXT: v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13] 2199; GFX8-NEXT: v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13] 2200; GFX8-NEXT: v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13] 2201; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13] 2202; GFX8-NEXT: v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15] 2203; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11] 2204; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9] 2205; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7] 2206; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] 2207; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc 2208; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] 2209; GFX8-NEXT: v_mov_b32_e32 v0, v10 2210; GFX8-NEXT: s_setpc_b64 s[30:31] 2211; 2212; GFX9-LABEL: v_mul_i256: 2213; GFX9: ; %bb.0: 2214; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2215; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 2216; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 2217; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] 2218; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11 2219; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 2220; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] 2221; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17] 2222; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17] 2223; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] 2224; GFX9-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] 2225; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17] 2226; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] 2227; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc 2228; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] 2229; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc 2230; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0 2231; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, 
v8, v[18:19] 2232; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] 2233; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22] 2234; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v20, vcc 2235; GFX9-NEXT: v_mov_b32_e32 v20, v18 2236; GFX9-NEXT: v_mov_b32_e32 v18, v19 2237; GFX9-NEXT: v_mov_b32_e32 v19, v16 2238; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] 2239; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 2240; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] 2241; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22] 2242; GFX9-NEXT: v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5] 2243; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19] 2244; GFX9-NEXT: v_mov_b32_e32 v19, v22 2245; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20] 2246; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24] 2247; GFX9-NEXT: v_mul_lo_u32 v24, v3, v12 2248; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23] 2249; GFX9-NEXT: v_mul_lo_u32 v22, v2, v13 2250; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12] 2251; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] 2252; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19] 2253; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13] 2254; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11] 2255; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 2256; GFX9-NEXT: v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13] 2257; GFX9-NEXT: v_mov_b32_e32 v20, v11 2258; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] 2259; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19] 2260; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] 2261; GFX9-NEXT: v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13] 2262; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 2263; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] 2264; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21] 2265; GFX9-NEXT: v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13] 2266; 
GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 2267; GFX9-NEXT: v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13] 2268; GFX9-NEXT: v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13] 2269; GFX9-NEXT: v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13] 2270; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13] 2271; GFX9-NEXT: v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15] 2272; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11] 2273; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9] 2274; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7] 2275; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] 2276; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc 2277; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] 2278; GFX9-NEXT: v_mov_b32_e32 v0, v10 2279; GFX9-NEXT: s_setpc_b64 s[30:31] 2280; 2281; GFX10-LABEL: v_mul_i256: 2282; GFX10: ; %bb.0: 2283; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2284; GFX10-NEXT: v_mov_b32_e32 v16, v0 2285; GFX10-NEXT: v_mov_b32_e32 v17, v1 2286; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 2287; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 2288; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 2289; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 2290; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 2291; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] 2292; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] 2293; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] 2294; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 2295; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] 2296; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] 2297; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo 2298; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 2299; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] 2300; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] 2301; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo 2302; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, 
v[0:1] 2303; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] 2304; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo 2305; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] 2306; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] 2307; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 2308; GFX10-NEXT: v_mov_b32_e32 v20, v22 2309; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] 2310; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo 2311; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] 2312; GFX10-NEXT: v_mov_b32_e32 v20, v18 2313; GFX10-NEXT: v_mov_b32_e32 v19, v22 2314; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 2315; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] 2316; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] 2317; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 2318; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 2319; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 2320; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] 2321; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 2322; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] 2323; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 2324; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 2325; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] 2326; GFX10-NEXT: v_mov_b32_e32 v13, v1 2327; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] 2328; GFX10-NEXT: v_mov_b32_e32 v14, v21 2329; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 2330; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] 2331; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] 2332; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 2333; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] 2334; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 2335; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] 2336; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] 2337; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 2338; GFX10-NEXT: v_add_co_ci_u32_e64 
v4, s9, v29, v4, s9 2339; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 2340; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 2341; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 2342; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 2343; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 2344; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 2345; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 2346; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo 2347; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 2348; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] 2349; GFX10-NEXT: s_setpc_b64 s[30:31] 2350; 2351; GFX11-LABEL: v_mul_i256: 2352; GFX11: ; %bb.0: 2353; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2354; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 2355; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 2356; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 2357; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 2358; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 2359; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 2360; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 2361; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] 2362; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] 2363; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 2364; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] 2365; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] 2366; GFX11-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo 2367; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 2368; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] 2369; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] 2370; GFX11-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo 2371; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] 2372; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] 2373; GFX11-NEXT: v_add_co_ci_u32_e32 v27, vcc_lo, 0, v24, vcc_lo 2374; GFX11-NEXT: v_mad_u64_u32 
v[0:1], null, v5, v9, v[0:1] 2375; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] 2376; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] 2377; GFX11-NEXT: v_mov_b32_e32 v20, v8 2378; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 2379; GFX11-NEXT: v_mov_b32_e32 v21, v22 2380; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 2381; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] 2382; GFX11-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo 2383; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] 2384; GFX11-NEXT: v_mov_b32_e32 v6, v25 2385; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 2386; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] 2387; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] 2388; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 2389; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 2390; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] 2391; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] 2392; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 2393; GFX11-NEXT: v_add_co_ci_u32_e64 v8, s2, 0, v8, s2 2394; GFX11-NEXT: v_mov_b32_e32 v11, v1 2395; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] 2396; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] 2397; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 2398; GFX11-NEXT: v_mov_b32_e32 v12, v24 2399; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s2, 0, v8, s2 2400; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] 2401; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] 2402; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 2403; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] 2404; GFX11-NEXT: v_add_co_ci_u32_e64 v10, s4, 0, v10, s4 2405; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7] 2406; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] 2407; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 2408; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 2409; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 2410; GFX11-NEXT: v_add_co_ci_u32_e64 
v6, s5, v27, v6, s5 2411; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s5, v23, v25, s5 2412; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v29, s4 2413; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s2, v7, v20, s2 2414; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s2, v7, v21, s3 2415; GFX11-NEXT: v_add_co_ci_u32_e64 v7, s1, v7, v30, s1 2416; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v28, vcc_lo 2417; GFX11-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v7, v22, s0 2418; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] 2419; GFX11-NEXT: s_setpc_b64 s[30:31] 2420; 2421; GFX12-LABEL: v_mul_i256: 2422; GFX12: ; %bb.0: 2423; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 2424; GFX12-NEXT: s_wait_expcnt 0x0 2425; GFX12-NEXT: s_wait_samplecnt 0x0 2426; GFX12-NEXT: s_wait_bvhcnt 0x0 2427; GFX12-NEXT: s_wait_kmcnt 0x0 2428; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 2429; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 2430; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 2431; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 2432; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 2433; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 2434; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 2435; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] 2436; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) 2437; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] 2438; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 2439; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2440; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] 2441; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] 2442; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 2443; GFX12-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo 2444; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 2445; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] 2446; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 2447; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] 2448; GFX12-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo 2449; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2450; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] 2451; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] 2452; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2453; GFX12-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo 2454; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] 2455; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 2456; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] 2457; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] 2458; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 2459; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2460; GFX12-NEXT: v_mov_b32_e32 v20, v22 2461; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] 2462; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2463; GFX12-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo 2464; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] 2465; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) 2466; GFX12-NEXT: v_mov_b32_e32 v19, v22 2467; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 2468; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] 2469; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 2470; GFX12-NEXT: v_mov_b32_e32 v20, v18 2471; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) 2472; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] 2473; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] 2474; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 2475; GFX12-NEXT: v_mul_lo_u32 
v25, v3, v12 2476; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 2477; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 2478; GFX12-NEXT: v_mov_b32_e32 v13, v1 2479; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] 2480; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] 2481; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 2482; GFX12-NEXT: v_mov_b32_e32 v14, v21 2483; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 2484; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] 2485; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s2, 0, v6, s2 2486; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] 2487; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) 2488; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] 2489; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 2490; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] 2491; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2492; GFX12-NEXT: v_add_co_ci_u32_e64 v14, s4, 0, v6, s4 2493; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] 2494; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] 2495; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 2496; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2497; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 2498; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 2499; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2500; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 2501; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s5, v23, v22, s5 2502; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2503; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s4, v9, v30, s4 2504; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s2, v9, v24, s2 2505; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2506; GFX12-NEXT: v_add_co_ci_u32_e64 
v9, s2, v9, v25, s3
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
  %result = mul i256 %num, %den
  ret i256 %result
}

; 64-bit multiply of a zero-extended i32 by the constant 80 (printed as 0x50
; in the assertions below), with the operand loaded through a pointer that is
; not marked inreg. The assertions show the load address, the product and the
; store all using vector registers. Targets before GFX12 are expected to emit a
; single v_mad_u64_u32 with a zero addend, while GFX12 is expected to emit
; v_mad_co_u64_u32 instead.
define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_zext_with_vregs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
; GFX7-NEXT: v_mov_b32_e32 v3, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v3, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v2, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v2, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b32 v2, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = zext i32 %val to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}

; Same zero-extend-then-multiply-by-80 pattern, but in an amdgpu_kernel, where
; the assertions show the pointers and the loaded value arriving through scalar
; loads. Expected lowering of the 64-bit product differs by target in the
; assertions below. GFX7 and GFX8 compute the low half with a scalar multiply
; and the high half with v_mul_hi_u32 followed by v_readfirstlane_b32. GFX9,
; GFX10 and GFX11 use the s_mul_i32 plus s_mul_hi_u32 pair. GFX12 clears the
; upper register with s_mov_b32 and issues one s_mul_u64.
define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_zext_with_sregs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0
; GFX7-NEXT: s_mul_i32 s4, s3, 0x50
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mul_i32 s2, s3, 0x50
; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = zext i32 %val to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}

; Signed counterpart of the vector-register test: the loaded i32 is
; sign-extended to i64 before the multiply by 80. Targets before GFX12 are
; expected to emit an unsigned v_mad_u64_u32 for the low word, build the sign
; word with v_ashrrev_i32 by 31, and fold it into the high half with a second
; v_mad_u64_u32. GFX12 is expected to emit a single v_mad_co_i64_i32.
define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_sext_with_vregs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v4, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v4, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_vregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4]
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_vregs:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_b32 v4, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
; GFX11-NEXT: v_mov_b32_e32 v3, v4
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: global_load_b32 v2, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = sext i32 %val to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}

; Signed counterpart of the scalar-register test: an amdgpu_kernel that
; sign-extends the loaded i32 to i64 and multiplies by 80. Targets before GFX12
; are expected to form the sign word with s_ashr_i32, multiply both words by
; 0x50, and combine the high half with a scalar add; GFX7 and GFX8 additionally
; obtain the unsigned high product through v_mul_hi_u32 and
; v_readfirstlane_b32. GFX12 is expected to emit s_ashr_i32 followed by one
; s_mul_u64.
define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX7-LABEL: s_mul_u64_sext_with_sregs:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0
; GFX7-NEXT: s_ashr_i32 s5, s3, 31
; GFX7-NEXT: s_mul_i32 s4, s3, 0x50
; GFX7-NEXT: s_mulk_i32 s5, 0x50
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
; GFX7-NEXT: s_add_u32 s5, s5, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX8-NEXT: s_ashr_i32 s3, s2, 31
; GFX8-NEXT: s_mulk_i32 s2, 0x50
; GFX8-NEXT: s_mulk_i32 s3, 0x50
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
; GFX8-NEXT: s_add_u32 s3, s3, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
; GFX9-NEXT: s_mulk_i32 s4, 0x50
; GFX9-NEXT: s_add_u32 s3, s4, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s3, s2, 31
; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
; GFX10-NEXT: s_mulk_i32 s3, 0x50
; GFX10-NEXT: s_mulk_i32 s2, 0x50
; GFX10-NEXT: s_add_i32 s3, s4, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_ashr_i32 s3, s2, 31
; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50
; GFX11-NEXT: s_mulk_i32 s3, 0x50
; GFX11-NEXT: s_mulk_i32 s2, 0x50
; GFX11-NEXT: s_add_i32 s3, s4, s3
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_ashr_i32 s3, s2, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %ext = sext i32 %val to i64
  %mul = mul i64 %ext, 80
  store i64 %mul, ptr addrspace(1) %out, align 8
  ret void
}