; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s

; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
;
; Check prefixes used below:
;   GISEL - run with -amdgpu-codegenprepare-disable-idiv-expansion=1
;   CGP   - run with -amdgpu-codegenprepare-disable-idiv-expansion=0
;   CHECK - expectations shared by both runs

; Scalar i32 udiv with two non-constant operands.
define i32 @v_udiv_i32(i32 %num, i32 %den) {
; GISEL-LABEL: v_udiv_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT:    v_mul_hi_u32 v2, v0, v2
; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_i32:
; CGP:       ; %bb.0:
; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
; CGP-NEXT:    v_rcp_f32_e32 v2, v2
; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; CGP-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; CGP-NEXT:    s_setpc_b64 s[30:31]
  %result = udiv i32 %num, %den
  ret i32 %result
}

; FIXME: This is a workaround for not handling uniform VGPR case.
declare i32 @llvm.amdgcn.readfirstlane(i32)

; Scalar i32 udiv with inreg operands in an amdgpu_ps function; the quotient
; is passed through readfirstlane before being returned.
define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
; GISEL-LABEL: s_udiv_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s1
; GISEL-NEXT:    s_sub_i32 s2, 0, s1
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GISEL-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GISEL-NEXT:    v_mul_lo_u32 v1, s2, v0
; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v1
; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT:    v_mul_hi_u32 v0, s0, v0
; GISEL-NEXT:    v_mul_lo_u32 v1, v0, s1
; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GISEL-NEXT:    v_subrev_i32_e64 v2, s[2:3], s1, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GISEL-NEXT:    v_readfirstlane_b32 s0, v0
; GISEL-NEXT:    ; return to shader part epilog
;
; CGP-LABEL: s_udiv_i32:
; CGP:       ; %bb.0:
; CGP-NEXT:    v_cvt_f32_u32_e32 v0, s1
; CGP-NEXT:    s_sub_i32 s2, 0, s1
; CGP-NEXT:    v_rcp_f32_e32 v0, v0
; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
; CGP-NEXT:    v_mul_lo_u32 v1, s2, v0
; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT:    v_mul_hi_u32 v0, s0, v0
; CGP-NEXT:    v_mul_lo_u32 v1, v0, s1
; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; CGP-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; CGP-NEXT:    v_subrev_i32_e64 v2, s[2:3], s1, v1
; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; CGP-NEXT:    v_readfirstlane_b32 s0, v0
; CGP-NEXT:    ; return to shader part epilog
  %result = udiv i32 %num, %den
  %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %result)
  ret i32 %readlane
}

; <2 x i32> udiv with two non-constant operands.
define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_udiv_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
; GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32:
; CGP:       ; %bb.0:
; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
; CGP-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT:    v_rcp_f32_e32 v4, v4
; CGP-NEXT:    v_rcp_f32_e32 v6, v6
; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
; CGP-NEXT:    s_setpc_b64 s[30:31]
  %result = udiv <2 x i32> %num, %den
  ret <2 x i32> %result
}

; Constant power-of-two denominator (4096): both runs expect a single
; logical shift right by 12.
define i32 @v_udiv_i32_pow2k_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_pow2k_denom:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %result = udiv i32 %num, 4096
  ret i32 %result
}

; Vector form of the constant power-of-two denominator case (4096 per lane).
define <2 x i32> @v_udiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CHECK-LABEL: v_udiv_v2i32_pow2k_denom:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 12, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %result = udiv <2 x i32> %num, <i32 4096, i32 4096>
  ret <2 x i32> %result
}

; Odd constant denominator (1235195): both runs expect a multiply-high by
; 0xb2a50881 followed by a sub/shift/add/shift sequence.
define i32 @v_udiv_i32_oddk_denom(i32 %num) {
; CHECK-LABEL: v_udiv_i32_oddk_denom:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v1, 0xb2a50881
; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %result = udiv i32 %num, 1235195
  ret i32 %result
}

; Vector form of the odd constant denominator case (1235195 per lane).
define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
; CHECK-NEXT:    v_mul_hi_u32 v3, v0, v2
; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %result = udiv <2 x i32> %num, <i32 1235195, i32 1235195>
  ret <2 x i32> %result
}

; Denominator is the constant 4096 shifted left by a variable amount. Both
; runs share one set of expectations for the scalar case.
define i32 @v_udiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
; CHECK-LABEL: v_udiv_i32_pow2_shl_denom:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    v_lshl_b32_e32 v1, 0x1000, v1
; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v2
; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v1
; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; CHECK-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; CHECK-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  %shl.y = shl i32 4096, %y
  %r = udiv i32 %x, %shl.y
  ret i32 %r
}

; Vector form of the variable-shifted power-of-two denominator case. Unlike the
; scalar test above, the two runs have separate expectations here.
define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL-LABEL: v_udiv_v2i32_pow2_shl_denom:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
; GISEL-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
; GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
; CGP:       ; %bb.0:
; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT:    v_lshl_b32_e32 v2, 0x1000, v2
; CGP-NEXT:    v_lshl_b32_e32 v3, 0x1000, v3
; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v3
; CGP-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT:    v_rcp_f32_e32 v4, v4
; CGP-NEXT:    v_rcp_f32_e32 v6, v6
; CGP-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
; CGP-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; CGP-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
; CGP-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
; CGP-NEXT:    s_setpc_b64 s[30:31]
  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
  %r = udiv <2 x i32> %x, %shl.y
  ret <2 x i32> %r
}

; Both operands are masked to their low 24 bits (and with 16777215) before the
; division. The CGP run expects a float reciprocal/fma based sequence here,
; while the GISEL run expects the same integer sequence as v_udiv_i32.
define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
; GISEL-LABEL: v_udiv_i32_24bit:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT:    v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT:    v_mul_hi_u32 v2, v0, v2
; GISEL-NEXT:    v_mul_lo_u32 v3, v2, v1
; GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
; GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_i32_24bit:
; CGP:       ; %bb.0:
; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
; CGP-NEXT:    v_rcp_f32_e32 v2, v1
; CGP-NEXT:    v_mul_f32_e32 v2, v0, v2
; CGP-NEXT:    v_trunc_f32_e32 v2, v2
; CGP-NEXT:    v_fma_f32 v0, -v2, v1, v0
; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, v1
; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CGP-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT:    s_setpc_b64 s[30:31]
  %num.mask = and i32 %num, 16777215
  %den.mask = and i32 %den, 16777215
  %result = udiv i32 %num.mask, %den.mask
  ret i32 %result
}

; Vector form of the 24-bit masked operand case.
define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_udiv_v2i32_24bit:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v2
; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v4
; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v6
; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v4
; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT:    v_mul_lo_u32 v6, v4, v2
; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v3
; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v0, v2
; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v3
; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[4:5]
; GISEL-NEXT:    v_sub_i32_e64 v7, s[6:7], v1, v3
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
; GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
; GISEL-NEXT:    s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32_24bit:
; CGP:       ; %bb.0:
; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
; CGP-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
; CGP-NEXT:    v_cvt_f32_u32_e32 v0, v0
; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v2
; CGP-NEXT:    v_cvt_f32_u32_e32 v1, v1
; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v3
; CGP-NEXT:    v_rcp_f32_e32 v4, v2
; CGP-NEXT:    v_rcp_f32_e32 v5, v3
; CGP-NEXT:    v_mul_f32_e32 v4, v0, v4
; CGP-NEXT:    v_mul_f32_e32 v5, v1, v5
; CGP-NEXT:    v_trunc_f32_e32 v4, v4
; CGP-NEXT:    v_trunc_f32_e32 v5, v5
; CGP-NEXT:    v_fma_f32 v0, -v4, v2, v0
; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT:    v_fma_f32 v1, -v5, v3, v1
; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v0|, v2
; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; CGP-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, v3
; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
; CGP-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT:    s_setpc_b64 s[30:31]
  %num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
  %den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
  %result = udiv <2 x i32> %num.mask, %den.mask
  ret <2 x i32> %result
}