1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx600 | FileCheck %s --check-prefix=GCN 3; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global | FileCheck %s --check-prefix=TONGA 4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global | FileCheck %s --check-prefix=GFX9 5; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG 6 7; The code generated by sdiv is long and complex and may frequently change. 8; The goal of this test is to make sure the ISel doesn't fail. 9; 10; This program was previously failing to compile when one of the selectcc 11; opcodes generated by the sdiv lowering was being legalized and optimized to: 12; selectcc Remainder -1, 0, -1, SETGT 13; This was fixed by adding an additional pattern in R600Instructions.td to 14; match this pattern with a CNDGE_INT. 
15 16define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 17; GCN-LABEL: sdiv_i32: 18; GCN: ; %bb.0: 19; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 20; GCN-NEXT: s_mov_b32 s3, 0xf000 21; GCN-NEXT: s_mov_b32 s2, -1 22; GCN-NEXT: s_mov_b32 s10, s2 23; GCN-NEXT: s_mov_b32 s11, s3 24; GCN-NEXT: s_waitcnt lgkmcnt(0) 25; GCN-NEXT: s_mov_b32 s8, s6 26; GCN-NEXT: s_mov_b32 s9, s7 27; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 28; GCN-NEXT: s_mov_b32 s0, s4 29; GCN-NEXT: s_mov_b32 s1, s5 30; GCN-NEXT: s_waitcnt vmcnt(0) 31; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 32; GCN-NEXT: v_max_i32_e32 v2, v1, v2 33; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 34; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 35; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 36; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 37; GCN-NEXT: v_max_i32_e32 v5, v0, v5 38; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 39; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 40; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 41; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 42; GCN-NEXT: v_mul_lo_u32 v4, v4, v3 43; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 44; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 45; GCN-NEXT: v_mul_hi_u32 v3, v5, v3 46; GCN-NEXT: v_mul_lo_u32 v1, v3, v2 47; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 48; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 49; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2 50; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 51; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 52; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 53; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3 54; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 55; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 56; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 57; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 58; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 59; GCN-NEXT: s_endpgm 60; 61; TONGA-LABEL: sdiv_i32: 62; TONGA: ; %bb.0: 63; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 64; TONGA-NEXT: s_mov_b32 s3, 0xf000 65; TONGA-NEXT: s_mov_b32 s2, -1 66; TONGA-NEXT: s_mov_b32 s10, s2 67; TONGA-NEXT: 
s_mov_b32 s11, s3 68; TONGA-NEXT: s_waitcnt lgkmcnt(0) 69; TONGA-NEXT: s_mov_b32 s8, s6 70; TONGA-NEXT: s_mov_b32 s9, s7 71; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 72; TONGA-NEXT: s_mov_b32 s0, s4 73; TONGA-NEXT: s_mov_b32 s1, s5 74; TONGA-NEXT: s_waitcnt vmcnt(0) 75; TONGA-NEXT: v_sub_u32_e32 v2, vcc, 0, v1 76; TONGA-NEXT: v_max_i32_e32 v2, v1, v2 77; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 78; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 79; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 80; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 81; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 82; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 83; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0 84; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 85; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 86; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3 87; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4 88; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4 89; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3 90; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2 91; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 92; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1 93; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2 94; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 95; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 96; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 97; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3 98; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 99; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc 100; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 101; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 102; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 103; TONGA-NEXT: s_endpgm 104; 105; GFX9-LABEL: sdiv_i32: 106; GFX9: ; %bb.0: 107; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 108; GFX9-NEXT: s_mov_b32 s3, 0xf000 109; GFX9-NEXT: s_mov_b32 s2, -1 110; GFX9-NEXT: s_mov_b32 s6, s2 111; GFX9-NEXT: s_mov_b32 s7, s3 112; GFX9-NEXT: s_waitcnt lgkmcnt(0) 113; GFX9-NEXT: s_mov_b32 s4, s10 114; GFX9-NEXT: s_mov_b32 s5, s11 115; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 116; GFX9-NEXT: s_mov_b32 s0, s8 117; 
GFX9-NEXT: s_mov_b32 s1, s9 118; GFX9-NEXT: s_waitcnt vmcnt(0) 119; GFX9-NEXT: v_readfirstlane_b32 s4, v1 120; GFX9-NEXT: s_abs_i32 s5, s4 121; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 122; GFX9-NEXT: v_readfirstlane_b32 s6, v0 123; GFX9-NEXT: s_sub_i32 s7, 0, s5 124; GFX9-NEXT: s_xor_b32 s4, s6, s4 125; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 126; GFX9-NEXT: s_abs_i32 s6, s6 127; GFX9-NEXT: s_ashr_i32 s4, s4, 31 128; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 129; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 130; GFX9-NEXT: v_readfirstlane_b32 s8, v0 131; GFX9-NEXT: s_mul_i32 s7, s7, s8 132; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 133; GFX9-NEXT: s_add_i32 s8, s8, s7 134; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8 135; GFX9-NEXT: s_mul_i32 s8, s7, s5 136; GFX9-NEXT: s_sub_i32 s6, s6, s8 137; GFX9-NEXT: s_add_i32 s9, s7, 1 138; GFX9-NEXT: s_sub_i32 s8, s6, s5 139; GFX9-NEXT: s_cmp_ge_u32 s6, s5 140; GFX9-NEXT: s_cselect_b32 s7, s9, s7 141; GFX9-NEXT: s_cselect_b32 s6, s8, s6 142; GFX9-NEXT: s_add_i32 s8, s7, 1 143; GFX9-NEXT: s_cmp_ge_u32 s6, s5 144; GFX9-NEXT: s_cselect_b32 s5, s8, s7 145; GFX9-NEXT: s_xor_b32 s5, s5, s4 146; GFX9-NEXT: s_sub_i32 s4, s5, s4 147; GFX9-NEXT: v_mov_b32_e32 v0, s4 148; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 149; GFX9-NEXT: s_endpgm 150; 151; EG-LABEL: sdiv_i32: 152; EG: ; %bb.0: 153; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 154; EG-NEXT: TEX 0 @6 155; EG-NEXT: ALU 26, @9, KC0[CB0:0-32], KC1[] 156; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 157; EG-NEXT: CF_END 158; EG-NEXT: PAD 159; EG-NEXT: Fetch clause starting at 6: 160; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 161; EG-NEXT: ALU clause starting at 8: 162; EG-NEXT: MOV * T0.X, KC0[2].Z, 163; EG-NEXT: ALU clause starting at 9: 164; EG-NEXT: SETGT_INT * T0.W, 0.0, T0.Y, 165; EG-NEXT: ADD_INT * T1.W, T0.Y, PV.W, 166; EG-NEXT: XOR_INT * T1.W, PV.W, T0.W, 167; EG-NEXT: SUB_INT T2.W, 0.0, PV.W, 168; EG-NEXT: RECIP_UINT * T0.Y, PV.W, 169; EG-NEXT: SETGT_INT T3.W, 0.0, T0.X, 170; EG-NEXT: MULLO_INT * 
T0.Z, PV.W, PS, 171; EG-NEXT: ADD_INT T2.W, T0.X, PV.W, 172; EG-NEXT: MULHI * T0.X, T0.Y, PS, 173; EG-NEXT: ADD_INT T4.W, T0.Y, PS, 174; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W, 175; EG-NEXT: MULHI * T0.X, PS, PV.W, 176; EG-NEXT: MULLO_INT * T0.Y, PS, T1.W, 177; EG-NEXT: SUB_INT * T2.W, T2.W, PS, 178; EG-NEXT: ADD_INT T0.Z, T0.X, 1, 179; EG-NEXT: SETGE_UINT T4.W, PV.W, T1.W, 180; EG-NEXT: SUB_INT * T5.W, PV.W, T1.W, 181; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS, 182; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z, 183; EG-NEXT: ADD_INT T5.W, PS, 1, 184; EG-NEXT: SETGE_UINT * T1.W, PV.W, T1.W, 185; EG-NEXT: CNDE_INT T1.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221 186; EG-NEXT: XOR_INT * T0.W, T3.W, T0.W, 187; EG-NEXT: XOR_INT * T1.W, PV.W, PS, 188; EG-NEXT: SUB_INT T0.X, PV.W, T0.W, 189; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 190; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 191 %den_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 192 %num = load i32, ptr addrspace(1) %in 193 %den = load i32, ptr addrspace(1) %den_ptr 194 %result = sdiv i32 %num, %den 195 store i32 %result, ptr addrspace(1) %out 196 ret void 197} 198 199define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { 200; GCN-LABEL: sdiv_i32_4: 201; GCN: ; %bb.0: 202; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 203; GCN-NEXT: s_mov_b32 s7, 0xf000 204; GCN-NEXT: s_mov_b32 s6, -1 205; GCN-NEXT: s_mov_b32 s10, s6 206; GCN-NEXT: s_mov_b32 s11, s7 207; GCN-NEXT: s_waitcnt lgkmcnt(0) 208; GCN-NEXT: s_mov_b32 s8, s2 209; GCN-NEXT: s_mov_b32 s9, s3 210; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 211; GCN-NEXT: s_mov_b32 s4, s0 212; GCN-NEXT: s_mov_b32 s5, s1 213; GCN-NEXT: s_waitcnt vmcnt(0) 214; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 215; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1 216; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 217; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 218; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 219; GCN-NEXT: s_endpgm 220; 221; TONGA-LABEL: sdiv_i32_4: 222; TONGA: ; 
%bb.0: 223; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 224; TONGA-NEXT: s_mov_b32 s7, 0xf000 225; TONGA-NEXT: s_mov_b32 s6, -1 226; TONGA-NEXT: s_mov_b32 s10, s6 227; TONGA-NEXT: s_mov_b32 s11, s7 228; TONGA-NEXT: s_waitcnt lgkmcnt(0) 229; TONGA-NEXT: s_mov_b32 s8, s2 230; TONGA-NEXT: s_mov_b32 s9, s3 231; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 232; TONGA-NEXT: s_mov_b32 s4, s0 233; TONGA-NEXT: s_mov_b32 s5, s1 234; TONGA-NEXT: s_waitcnt vmcnt(0) 235; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 236; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1 237; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 238; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 239; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 240; TONGA-NEXT: s_endpgm 241; 242; GFX9-LABEL: sdiv_i32_4: 243; GFX9: ; %bb.0: 244; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 245; GFX9-NEXT: s_mov_b32 s7, 0xf000 246; GFX9-NEXT: s_mov_b32 s6, -1 247; GFX9-NEXT: s_mov_b32 s10, s6 248; GFX9-NEXT: s_mov_b32 s11, s7 249; GFX9-NEXT: s_waitcnt lgkmcnt(0) 250; GFX9-NEXT: s_mov_b32 s8, s2 251; GFX9-NEXT: s_mov_b32 s9, s3 252; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 253; GFX9-NEXT: s_mov_b32 s4, s0 254; GFX9-NEXT: s_mov_b32 s5, s1 255; GFX9-NEXT: s_waitcnt vmcnt(0) 256; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 257; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1 258; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 259; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 260; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 261; GFX9-NEXT: s_endpgm 262; 263; EG-LABEL: sdiv_i32_4: 264; EG: ; %bb.0: 265; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 266; EG-NEXT: TEX 0 @6 267; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] 268; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 269; EG-NEXT: CF_END 270; EG-NEXT: PAD 271; EG-NEXT: Fetch clause starting at 6: 272; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 273; EG-NEXT: ALU clause starting at 8: 274; EG-NEXT: MOV * T0.X, KC0[2].Z, 275; EG-NEXT: ALU clause starting at 9: 276; EG-NEXT: ASHR * T0.W, T0.X, literal.x, 277; 
EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 278; EG-NEXT: LSHR * T0.W, PV.W, literal.x, 279; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 280; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W, 281; EG-NEXT: ASHR T0.X, PV.W, literal.x, 282; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 283; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 284 %num = load i32, ptr addrspace(1) %in 285 %result = sdiv i32 %num, 4 286 store i32 %result, ptr addrspace(1) %out 287 ret void 288} 289 290; Divide by a weird (non-power-of-2) constant to make sure setIntDivIsCheap is 291; working: the sdiv is expected to lower to a multiply-high by a magic constant. 292 293define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspace(1) %in) { 294; GCN-LABEL: slow_sdiv_i32_3435: 295; GCN: ; %bb.0: 296; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 297; GCN-NEXT: s_mov_b32 s7, 0xf000 298; GCN-NEXT: s_mov_b32 s6, -1 299; GCN-NEXT: s_mov_b32 s10, s6 300; GCN-NEXT: s_mov_b32 s11, s7 301; GCN-NEXT: s_waitcnt lgkmcnt(0) 302; GCN-NEXT: s_mov_b32 s8, s2 303; GCN-NEXT: s_mov_b32 s9, s3 304; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0 305; GCN-NEXT: s_mov_b32 s2, 0x98a1930b 306; GCN-NEXT: s_mov_b32 s4, s0 307; GCN-NEXT: s_mov_b32 s5, s1 308; GCN-NEXT: s_waitcnt vmcnt(0) 309; GCN-NEXT: v_mul_hi_i32 v1, v0, s2 310; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 311; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 312; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 313; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 314; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 315; GCN-NEXT: s_endpgm 316; 317; TONGA-LABEL: slow_sdiv_i32_3435: 318; TONGA: ; %bb.0: 319; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 320; TONGA-NEXT: s_mov_b32 s7, 0xf000 321; TONGA-NEXT: s_mov_b32 s6, -1 322; TONGA-NEXT: s_mov_b32 s10, s6 323; TONGA-NEXT: s_mov_b32 s11, s7 324; TONGA-NEXT: s_waitcnt lgkmcnt(0) 325; TONGA-NEXT: s_mov_b32 s8, s2 326; TONGA-NEXT: s_mov_b32 s9, s3 327; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0 328; TONGA-NEXT: s_mov_b32 s2, 0x98a1930b 329; TONGA-NEXT: s_mov_b32 s4, s0 330; TONGA-NEXT: s_mov_b32 s5, s1 331; 
TONGA-NEXT: s_waitcnt vmcnt(0) 332; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2 333; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0 334; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0 335; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0 336; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 337; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 338; TONGA-NEXT: s_endpgm 339; 340; GFX9-LABEL: slow_sdiv_i32_3435: 341; GFX9: ; %bb.0: 342; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 343; GFX9-NEXT: s_mov_b32 s7, 0xf000 344; GFX9-NEXT: s_mov_b32 s6, -1 345; GFX9-NEXT: s_mov_b32 s10, s6 346; GFX9-NEXT: s_mov_b32 s11, s7 347; GFX9-NEXT: s_waitcnt lgkmcnt(0) 348; GFX9-NEXT: s_mov_b32 s8, s2 349; GFX9-NEXT: s_mov_b32 s9, s3 350; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 351; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b 352; GFX9-NEXT: s_mov_b32 s4, s0 353; GFX9-NEXT: s_mov_b32 s5, s1 354; GFX9-NEXT: s_waitcnt vmcnt(0) 355; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2 356; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 357; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0 358; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0 359; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 360; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 361; GFX9-NEXT: s_endpgm 362; 363; EG-LABEL: slow_sdiv_i32_3435: 364; EG: ; %bb.0: 365; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 366; EG-NEXT: TEX 0 @6 367; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] 368; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 369; EG-NEXT: CF_END 370; EG-NEXT: PAD 371; EG-NEXT: Fetch clause starting at 6: 372; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 373; EG-NEXT: ALU clause starting at 8: 374; EG-NEXT: MOV * T0.X, KC0[2].Z, 375; EG-NEXT: ALU clause starting at 9: 376; EG-NEXT: MULHI_INT * T0.Y, T0.X, literal.x, 377; EG-NEXT: -1734241525(-4.176600e-24), 0(0.000000e+00) 378; EG-NEXT: ADD_INT * T0.W, PS, T0.X, 379; EG-NEXT: ASHR T1.W, PV.W, literal.x, 380; EG-NEXT: LSHR * T0.W, PV.W, literal.y, 381; EG-NEXT: 11(1.541428e-44), 31(4.344025e-44) 382; EG-NEXT: ADD_INT T0.X, PV.W, PS, 383; EG-NEXT: LSHR * T1.X, 
KC0[2].Y, literal.x, 384; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 385 %num = load i32, ptr addrspace(1) %in 386 %result = sdiv i32 %num, 3435 387 store i32 %result, ptr addrspace(1) %out 388 ret void 389} 390 391define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 392; GCN-LABEL: sdiv_v2i32: 393; GCN: ; %bb.0: 394; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 395; GCN-NEXT: s_mov_b32 s7, 0xf000 396; GCN-NEXT: s_mov_b32 s6, -1 397; GCN-NEXT: s_mov_b32 s10, s6 398; GCN-NEXT: s_mov_b32 s11, s7 399; GCN-NEXT: s_waitcnt lgkmcnt(0) 400; GCN-NEXT: s_mov_b32 s8, s2 401; GCN-NEXT: s_mov_b32 s9, s3 402; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 403; GCN-NEXT: s_mov_b32 s4, s0 404; GCN-NEXT: s_mov_b32 s5, s1 405; GCN-NEXT: s_waitcnt vmcnt(0) 406; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 407; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 408; GCN-NEXT: v_xor_b32_e32 v4, v0, v2 409; GCN-NEXT: v_xor_b32_e32 v7, v1, v3 410; GCN-NEXT: v_max_i32_e32 v2, v2, v6 411; GCN-NEXT: v_max_i32_e32 v3, v3, v9 412; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 413; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 414; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 415; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 416; GCN-NEXT: v_max_i32_e32 v0, v0, v5 417; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 418; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 419; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 420; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 421; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 422; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 423; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 424; GCN-NEXT: v_mul_lo_u32 v9, v9, v6 425; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 426; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 427; GCN-NEXT: v_mul_hi_u32 v9, v6, v9 428; GCN-NEXT: v_max_i32_e32 v1, v1, v8 429; GCN-NEXT: v_mul_hi_u32 v8, v5, v10 430; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4 431; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 432; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 433; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 434; GCN-NEXT: v_mul_hi_u32 v5, v1, 
v5 435; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 436; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 437; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 438; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6 439; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 440; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 441; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 442; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 443; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 444; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 445; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] 446; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3 447; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] 448; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] 449; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6 450; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] 451; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5 452; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 453; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc 454; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 455; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 456; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 457; GCN-NEXT: v_xor_b32_e32 v1, v1, v7 458; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 459; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 460; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 461; GCN-NEXT: s_endpgm 462; 463; TONGA-LABEL: sdiv_v2i32: 464; TONGA: ; %bb.0: 465; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 466; TONGA-NEXT: s_mov_b32 s7, 0xf000 467; TONGA-NEXT: s_mov_b32 s6, -1 468; TONGA-NEXT: s_mov_b32 s10, s6 469; TONGA-NEXT: s_mov_b32 s11, s7 470; TONGA-NEXT: s_waitcnt lgkmcnt(0) 471; TONGA-NEXT: s_mov_b32 s8, s2 472; TONGA-NEXT: s_mov_b32 s9, s3 473; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 474; TONGA-NEXT: s_mov_b32 s4, s0 475; TONGA-NEXT: s_mov_b32 s5, s1 476; TONGA-NEXT: s_waitcnt vmcnt(0) 477; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 478; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3 479; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2 480; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3 481; TONGA-NEXT: v_max_i32_e32 v2, v2, v6 482; TONGA-NEXT: v_max_i32_e32 
v3, v3, v9 483; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 484; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 485; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 486; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 487; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 488; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 489; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 490; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 491; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 492; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6 493; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 494; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3 495; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6 496; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 497; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1 498; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9 499; TONGA-NEXT: v_max_i32_e32 v1, v1, v8 500; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10 501; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4 502; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9 503; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8 504; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6 505; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 506; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7 507; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2 508; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 509; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6 510; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 511; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 512; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 513; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 514; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 515; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 516; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] 517; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 518; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] 519; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] 520; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6 521; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] 522; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5 523; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 524; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc 525; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 526; 
TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc 527; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 528; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 529; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 530; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7 531; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 532; TONGA-NEXT: s_endpgm 533; 534; GFX9-LABEL: sdiv_v2i32: 535; GFX9: ; %bb.0: 536; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 537; GFX9-NEXT: s_mov_b32 s3, 0xf000 538; GFX9-NEXT: s_mov_b32 s2, -1 539; GFX9-NEXT: s_mov_b32 s6, s2 540; GFX9-NEXT: s_mov_b32 s7, s3 541; GFX9-NEXT: s_waitcnt lgkmcnt(0) 542; GFX9-NEXT: s_mov_b32 s4, s10 543; GFX9-NEXT: s_mov_b32 s5, s11 544; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 545; GFX9-NEXT: s_waitcnt vmcnt(0) 546; GFX9-NEXT: v_readfirstlane_b32 s0, v2 547; GFX9-NEXT: s_abs_i32 s1, s0 548; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 549; GFX9-NEXT: v_readfirstlane_b32 s5, v0 550; GFX9-NEXT: s_xor_b32 s0, s5, s0 551; GFX9-NEXT: s_ashr_i32 s6, s0, 31 552; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 553; GFX9-NEXT: s_sub_i32 s0, 0, s1 554; GFX9-NEXT: s_abs_i32 s5, s5 555; GFX9-NEXT: v_readfirstlane_b32 s4, v3 556; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 557; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 558; GFX9-NEXT: v_readfirstlane_b32 s7, v0 559; GFX9-NEXT: s_mul_i32 s0, s0, s7 560; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 561; GFX9-NEXT: s_add_i32 s7, s7, s0 562; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 563; GFX9-NEXT: s_mul_i32 s7, s0, s1 564; GFX9-NEXT: s_sub_i32 s5, s5, s7 565; GFX9-NEXT: s_add_i32 s10, s0, 1 566; GFX9-NEXT: s_sub_i32 s7, s5, s1 567; GFX9-NEXT: s_cmp_ge_u32 s5, s1 568; GFX9-NEXT: s_cselect_b32 s0, s10, s0 569; GFX9-NEXT: s_cselect_b32 s5, s7, s5 570; GFX9-NEXT: s_add_i32 s7, s0, 1 571; GFX9-NEXT: s_cmp_ge_u32 s5, s1 572; GFX9-NEXT: s_cselect_b32 s5, s7, s0 573; GFX9-NEXT: s_abs_i32 s7, s4 574; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 575; GFX9-NEXT: s_xor_b32 s5, s5, s6 576; GFX9-NEXT: s_mov_b32 s1, s9 577; GFX9-NEXT: s_sub_i32 s9, 0, s7 578; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 579; GFX9-NEXT: s_sub_i32 s5, s5, s6 580; GFX9-NEXT: s_mov_b32 s0, s8 581; GFX9-NEXT: v_readfirstlane_b32 s8, v1 582; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 583; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 584; GFX9-NEXT: s_xor_b32 s4, s8, s4 585; GFX9-NEXT: s_abs_i32 s8, s8 586; GFX9-NEXT: s_ashr_i32 s4, s4, 31 587; GFX9-NEXT: v_readfirstlane_b32 s6, v0 588; GFX9-NEXT: s_mul_i32 s9, s9, s6 589; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9 590; GFX9-NEXT: s_add_i32 s6, s6, s9 591; GFX9-NEXT: s_mul_hi_u32 s6, s8, s6 592; GFX9-NEXT: s_mul_i32 s9, s6, s7 593; GFX9-NEXT: s_sub_i32 s8, s8, s9 594; GFX9-NEXT: s_add_i32 s10, s6, 1 595; GFX9-NEXT: s_sub_i32 s9, s8, s7 596; GFX9-NEXT: s_cmp_ge_u32 s8, s7 597; GFX9-NEXT: s_cselect_b32 s6, s10, s6 598; GFX9-NEXT: s_cselect_b32 s8, s9, s8 599; GFX9-NEXT: s_add_i32 s9, s6, 1 600; GFX9-NEXT: s_cmp_ge_u32 s8, s7 601; GFX9-NEXT: s_cselect_b32 s6, s9, s6 602; GFX9-NEXT: s_xor_b32 s6, s6, s4 603; GFX9-NEXT: s_sub_i32 s4, s6, s4 604; GFX9-NEXT: v_mov_b32_e32 v0, s5 605; GFX9-NEXT: v_mov_b32_e32 v1, s4 606; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 607; GFX9-NEXT: s_endpgm 608; 609; EG-LABEL: sdiv_v2i32: 610; EG: ; %bb.0: 611; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 612; EG-NEXT: TEX 0 @6 613; EG-NEXT: ALU 51, @9, KC0[CB0:0-32], KC1[] 614; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 615; EG-NEXT: CF_END 616; EG-NEXT: PAD 617; EG-NEXT: Fetch clause starting at 6: 618; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 619; EG-NEXT: ALU clause starting at 8: 620; EG-NEXT: MOV * T0.X, KC0[2].Z, 621; EG-NEXT: ALU clause starting at 9: 622; EG-NEXT: SETGT_INT * T1.W, 0.0, T0.W, 623; EG-NEXT: ADD_INT T0.W, T0.W, PV.W, 624; EG-NEXT: SETGT_INT * T2.W, 0.0, T0.Z, 625; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W, 626; EG-NEXT: SUB_INT T1.Z, 0.0, PV.W, 627; EG-NEXT: ADD_INT T3.W, T0.Z, T2.W, 628; EG-NEXT: RECIP_UINT * T0.Z, PV.W, 629; EG-NEXT: XOR_INT T3.W, PV.W, T2.W, 630; EG-NEXT: MULLO_INT * T1.X, PV.Z, PS, 631; 
EG-NEXT: SUB_INT T4.W, 0.0, PV.W, 632; EG-NEXT: RECIP_UINT * T1.Y, PV.W, 633; EG-NEXT: SETGT_INT T5.W, 0.0, T0.X, 634; EG-NEXT: MULLO_INT * T1.Z, PV.W, PS, 635; EG-NEXT: SETGT_INT T2.Z, 0.0, T0.Y, 636; EG-NEXT: ADD_INT T4.W, T0.X, PV.W, 637; EG-NEXT: MULHI * T0.X, T1.Y, PS, 638; EG-NEXT: ADD_INT T1.Y, T1.Y, PS, 639; EG-NEXT: XOR_INT T1.Z, PV.W, T5.W, 640; EG-NEXT: ADD_INT T4.W, T0.Y, PV.Z, BS:VEC_120/SCL_212 641; EG-NEXT: MULHI * T0.X, T0.Z, T1.X, 642; EG-NEXT: ADD_INT T0.Z, T0.Z, PS, 643; EG-NEXT: XOR_INT T4.W, PV.W, T2.Z, 644; EG-NEXT: MULHI * T0.X, PV.Z, PV.Y, 645; EG-NEXT: MULHI * T0.Y, PV.W, PV.Z, 646; EG-NEXT: MULLO_INT * T0.Z, PS, T0.W, 647; EG-NEXT: SUB_INT T4.W, T4.W, PS, 648; EG-NEXT: MULLO_INT * T0.Z, T0.X, T3.W, 649; EG-NEXT: SUB_INT T1.Y, T1.Z, PS, 650; EG-NEXT: ADD_INT T0.Z, T0.Y, 1, 651; EG-NEXT: SETGE_UINT T6.W, PV.W, T0.W, 652; EG-NEXT: SUB_INT * T7.W, PV.W, T0.W, 653; EG-NEXT: CNDE_INT T1.X, PV.W, T4.W, PS, BS:VEC_021/SCL_122 654; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PV.Z, 655; EG-NEXT: ADD_INT T0.Z, T0.X, 1, 656; EG-NEXT: SETGE_UINT T4.W, PV.Y, T3.W, 657; EG-NEXT: SUB_INT * T6.W, PV.Y, T3.W, 658; EG-NEXT: CNDE_INT T1.Y, PV.W, T1.Y, PS, 659; EG-NEXT: CNDE_INT T0.Z, PV.W, T0.X, PV.Z, 660; EG-NEXT: ADD_INT T4.W, PV.Y, 1, 661; EG-NEXT: SETGE_UINT * T0.W, PV.X, T0.W, 662; EG-NEXT: CNDE_INT T0.Y, PS, T0.Y, PV.W, 663; EG-NEXT: XOR_INT T1.Z, T2.Z, T1.W, BS:VEC_021/SCL_122 664; EG-NEXT: ADD_INT T0.W, PV.Z, 1, 665; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T3.W, 666; EG-NEXT: CNDE_INT T0.Z, PS, T0.Z, PV.W, 667; EG-NEXT: XOR_INT T0.W, T5.W, T2.W, 668; EG-NEXT: XOR_INT * T1.W, PV.Y, PV.Z, 669; EG-NEXT: SUB_INT T0.Y, PS, T1.Z, 670; EG-NEXT: XOR_INT * T1.W, PV.Z, PV.W, 671; EG-NEXT: SUB_INT T0.X, PV.W, T0.W, 672; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 673; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 674 %den_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 675 %num = load <2 x i32>, ptr addrspace(1) %in 676 %den = load <2 x i32>, ptr addrspace(1) %den_ptr 
677 %result = sdiv <2 x i32> %num, %den 678 store <2 x i32> %result, ptr addrspace(1) %out 679 ret void 680} 681 682define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { 683; GCN-LABEL: sdiv_v2i32_4: 684; GCN: ; %bb.0: 685; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 686; GCN-NEXT: s_mov_b32 s7, 0xf000 687; GCN-NEXT: s_mov_b32 s6, -1 688; GCN-NEXT: s_mov_b32 s10, s6 689; GCN-NEXT: s_mov_b32 s11, s7 690; GCN-NEXT: s_waitcnt lgkmcnt(0) 691; GCN-NEXT: s_mov_b32 s8, s2 692; GCN-NEXT: s_mov_b32 s9, s3 693; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 694; GCN-NEXT: s_mov_b32 s4, s0 695; GCN-NEXT: s_mov_b32 s5, s1 696; GCN-NEXT: s_waitcnt vmcnt(0) 697; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0 698; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1 699; GCN-NEXT: v_lshrrev_b32_e32 v2, 30, v2 700; GCN-NEXT: v_lshrrev_b32_e32 v3, 30, v3 701; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 702; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 703; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 704; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 705; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 706; GCN-NEXT: s_endpgm 707; 708; TONGA-LABEL: sdiv_v2i32_4: 709; TONGA: ; %bb.0: 710; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 711; TONGA-NEXT: s_mov_b32 s7, 0xf000 712; TONGA-NEXT: s_mov_b32 s6, -1 713; TONGA-NEXT: s_mov_b32 s10, s6 714; TONGA-NEXT: s_mov_b32 s11, s7 715; TONGA-NEXT: s_waitcnt lgkmcnt(0) 716; TONGA-NEXT: s_mov_b32 s8, s2 717; TONGA-NEXT: s_mov_b32 s9, s3 718; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 719; TONGA-NEXT: s_mov_b32 s4, s0 720; TONGA-NEXT: s_mov_b32 s5, s1 721; TONGA-NEXT: s_waitcnt vmcnt(0) 722; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0 723; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1 724; TONGA-NEXT: v_lshrrev_b32_e32 v2, 30, v2 725; TONGA-NEXT: v_lshrrev_b32_e32 v3, 30, v3 726; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 727; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 728; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 729; TONGA-NEXT: 
v_ashrrev_i32_e32 v1, 2, v1 730; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 731; TONGA-NEXT: s_endpgm 732; 733; GFX9-LABEL: sdiv_v2i32_4: 734; GFX9: ; %bb.0: 735; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 736; GFX9-NEXT: s_mov_b32 s7, 0xf000 737; GFX9-NEXT: s_mov_b32 s6, -1 738; GFX9-NEXT: s_mov_b32 s10, s6 739; GFX9-NEXT: s_mov_b32 s11, s7 740; GFX9-NEXT: s_waitcnt lgkmcnt(0) 741; GFX9-NEXT: s_mov_b32 s8, s2 742; GFX9-NEXT: s_mov_b32 s9, s3 743; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 744; GFX9-NEXT: s_mov_b32 s4, s0 745; GFX9-NEXT: s_mov_b32 s5, s1 746; GFX9-NEXT: s_waitcnt vmcnt(0) 747; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0 748; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1 749; GFX9-NEXT: v_lshrrev_b32_e32 v2, 30, v2 750; GFX9-NEXT: v_lshrrev_b32_e32 v3, 30, v3 751; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 752; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 753; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 754; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 755; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 756; GFX9-NEXT: s_endpgm 757; 758; EG-LABEL: sdiv_v2i32_4: 759; EG: ; %bb.0: 760; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 761; EG-NEXT: TEX 0 @6 762; EG-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[] 763; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 764; EG-NEXT: CF_END 765; EG-NEXT: PAD 766; EG-NEXT: Fetch clause starting at 6: 767; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 768; EG-NEXT: ALU clause starting at 8: 769; EG-NEXT: MOV * T0.X, KC0[2].Z, 770; EG-NEXT: ALU clause starting at 9: 771; EG-NEXT: ASHR * T0.W, T0.Y, literal.x, 772; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 773; EG-NEXT: LSHR T0.W, PV.W, literal.x, 774; EG-NEXT: ASHR * T1.W, T0.X, literal.y, 775; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 776; EG-NEXT: LSHR T1.W, PS, literal.x, 777; EG-NEXT: ADD_INT * T0.W, T0.Y, PV.W, 778; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 779; EG-NEXT: ASHR T0.Y, PS, literal.x, 780; EG-NEXT: ADD_INT * T0.W, T0.X, PV.W, 781; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) 782; EG-NEXT: ASHR T0.X, PV.W, literal.x, 783; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 784; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 785 %num = load <2 x i32>, ptr addrspace(1) %in 786 %result = sdiv <2 x i32> %num, <i32 4, i32 4> 787 store <2 x i32> %result, ptr addrspace(1) %out 788 ret void 789} 790 791define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 792; GCN-LABEL: sdiv_v4i32: 793; GCN: ; %bb.0: 794; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 795; GCN-NEXT: s_mov_b32 s7, 0xf000 796; GCN-NEXT: s_mov_b32 s6, -1 797; GCN-NEXT: s_mov_b32 s10, s6 798; GCN-NEXT: s_mov_b32 s11, s7 799; GCN-NEXT: s_waitcnt lgkmcnt(0) 800; GCN-NEXT: s_mov_b32 s8, s2 801; GCN-NEXT: s_mov_b32 s9, s3 802; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 803; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 804; GCN-NEXT: s_mov_b32 s4, s0 805; GCN-NEXT: s_mov_b32 s5, s1 806; GCN-NEXT: s_waitcnt vmcnt(1) 807; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v1 808; GCN-NEXT: s_waitcnt vmcnt(0) 809; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v5 810; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 811; GCN-NEXT: v_max_i32_e32 v5, v5, v12 812; GCN-NEXT: v_cvt_f32_u32_e32 v12, v5 813; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 814; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 815; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 816; GCN-NEXT: v_max_i32_e32 v4, v4, v10 817; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 818; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12 819; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 820; GCN-NEXT: v_cvt_f32_u32_e32 v12, v4 821; GCN-NEXT: v_max_i32_e32 v1, v1, v13 822; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 823; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 824; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 825; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 826; GCN-NEXT: v_max_i32_e32 v6, v6, v15 827; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 828; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 829; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 830; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 831; 
GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 832; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 833; GCN-NEXT: v_mul_lo_u32 v16, v16, v12 834; GCN-NEXT: v_mul_hi_u32 v10, v1, v10 835; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 836; GCN-NEXT: v_mul_hi_u32 v13, v12, v16 837; GCN-NEXT: v_max_i32_e32 v0, v0, v9 838; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 839; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 840; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13 841; GCN-NEXT: v_mul_lo_u32 v13, v10, v5 842; GCN-NEXT: v_mul_hi_u32 v12, v0, v12 843; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 844; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 845; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v13 846; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 847; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 848; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] 849; GCN-NEXT: v_sub_i32_e32 v13, vcc, v1, v5 850; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] 851; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 852; GCN-NEXT: v_mul_lo_u32 v1, v12, v4 853; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v6 854; GCN-NEXT: v_mul_lo_u32 v5, v5, v9 855; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 856; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 857; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 858; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] 859; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v4 860; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] 861; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 862; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 863; GCN-NEXT: v_mul_hi_u32 v4, v9, v5 864; GCN-NEXT: v_max_i32_e32 v5, v7, v0 865; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 866; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 867; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 868; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 869; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 870; GCN-NEXT: v_max_i32_e32 v2, v2, v9 871; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 872; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 873; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 874; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3] 875; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 
876; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 877; GCN-NEXT: v_mul_lo_u32 v8, v4, v6 878; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 879; GCN-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1] 880; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 881; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 882; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 883; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 884; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 885; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 886; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 887; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] 888; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 889; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 890; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] 891; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 892; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 893; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc 894; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 895; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 896; GCN-NEXT: v_max_i32_e32 v6, v3, v6 897; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 898; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 899; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 900; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 901; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 902; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 903; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 904; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 905; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 906; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 907; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 908; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 909; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 910; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 911; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 912; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 913; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 914; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 915; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 916; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 917; GCN-NEXT: s_endpgm 918; 919; TONGA-LABEL: sdiv_v4i32: 920; TONGA: ; %bb.0: 921; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 922; TONGA-NEXT: 
s_mov_b32 s7, 0xf000 923; TONGA-NEXT: s_mov_b32 s6, -1 924; TONGA-NEXT: s_mov_b32 s10, s6 925; TONGA-NEXT: s_mov_b32 s11, s7 926; TONGA-NEXT: s_waitcnt lgkmcnt(0) 927; TONGA-NEXT: s_mov_b32 s8, s2 928; TONGA-NEXT: s_mov_b32 s9, s3 929; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 930; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 931; TONGA-NEXT: s_mov_b32 s4, s0 932; TONGA-NEXT: s_mov_b32 s5, s1 933; TONGA-NEXT: s_waitcnt vmcnt(1) 934; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 935; TONGA-NEXT: s_waitcnt vmcnt(0) 936; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v5 937; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 938; TONGA-NEXT: v_max_i32_e32 v5, v5, v12 939; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v5 940; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 941; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 942; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 943; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 944; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 945; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v12 946; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 947; TONGA-NEXT: v_cvt_f32_u32_e32 v12, v4 948; TONGA-NEXT: v_max_i32_e32 v1, v1, v13 949; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 950; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 951; TONGA-NEXT: v_rcp_iflag_f32_e32 v12, v12 952; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 953; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 954; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 955; TONGA-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 956; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v12 957; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 958; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 959; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 960; TONGA-NEXT: v_mul_lo_u32 v16, v16, v12 961; TONGA-NEXT: v_mul_hi_u32 v10, v1, v10 962; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 963; TONGA-NEXT: v_mul_hi_u32 v13, v12, v16 964; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 965; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 966; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 967; TONGA-NEXT: v_add_u32_e32 v12, vcc, v12, v13 
968; TONGA-NEXT: v_mul_lo_u32 v13, v10, v5 969; TONGA-NEXT: v_mul_hi_u32 v12, v0, v12 970; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 971; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 972; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v13 973; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 974; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 975; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] 976; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v1, v5 977; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1] 978; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v1, v5 979; TONGA-NEXT: v_mul_lo_u32 v1, v12, v4 980; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v6 981; TONGA-NEXT: v_mul_lo_u32 v5, v5, v9 982; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 983; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 984; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 985; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] 986; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v4 987; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] 988; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v4 989; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 990; TONGA-NEXT: v_mul_hi_u32 v4, v9, v5 991; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 992; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 993; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 994; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 995; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 996; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 997; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 998; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 999; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1000; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 1001; TONGA-NEXT: v_cndmask_b32_e64 v0, v1, v12, s[2:3] 1002; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 1003; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 1004; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 1005; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 1006; TONGA-NEXT: v_cndmask_b32_e64 v1, v10, v13, s[0:1] 1007; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 1008; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 1009; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 1010; TONGA-NEXT: 
v_mul_lo_u32 v10, v10, v9 1011; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 1012; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 1013; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 1014; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] 1015; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 1016; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 1017; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] 1018; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 1019; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 1020; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc 1021; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10 1022; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 1023; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 1024; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 1025; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 1026; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 1027; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 1028; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 1029; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 1030; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 1031; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 1032; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 1033; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 1034; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 1035; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 1036; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 1037; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 1038; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 1039; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 1040; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 1041; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 1042; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 1043; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1044; TONGA-NEXT: s_endpgm 1045; 1046; GFX9-LABEL: sdiv_v4i32: 1047; GFX9: ; %bb.0: 1048; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1049; GFX9-NEXT: s_mov_b32 s3, 0xf000 1050; GFX9-NEXT: s_mov_b32 s2, -1 1051; GFX9-NEXT: s_mov_b32 s6, s2 1052; GFX9-NEXT: s_mov_b32 s7, s3 1053; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1054; GFX9-NEXT: s_mov_b32 s4, s10 1055; GFX9-NEXT: s_mov_b32 s5, s11 
1056; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 1057; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 1058; GFX9-NEXT: s_waitcnt vmcnt(1) 1059; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1060; GFX9-NEXT: s_abs_i32 s1, s0 1061; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 1062; GFX9-NEXT: s_waitcnt vmcnt(0) 1063; GFX9-NEXT: v_readfirstlane_b32 s5, v4 1064; GFX9-NEXT: s_xor_b32 s0, s5, s0 1065; GFX9-NEXT: s_ashr_i32 s6, s0, 31 1066; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1067; GFX9-NEXT: s_sub_i32 s0, 0, s1 1068; GFX9-NEXT: s_abs_i32 s5, s5 1069; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1070; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1071; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1072; GFX9-NEXT: v_readfirstlane_b32 s7, v0 1073; GFX9-NEXT: s_mul_i32 s0, s0, s7 1074; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 1075; GFX9-NEXT: s_add_i32 s7, s7, s0 1076; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 1077; GFX9-NEXT: s_mul_i32 s7, s0, s1 1078; GFX9-NEXT: s_sub_i32 s5, s5, s7 1079; GFX9-NEXT: s_add_i32 s10, s0, 1 1080; GFX9-NEXT: s_sub_i32 s7, s5, s1 1081; GFX9-NEXT: s_cmp_ge_u32 s5, s1 1082; GFX9-NEXT: s_cselect_b32 s0, s10, s0 1083; GFX9-NEXT: s_cselect_b32 s5, s7, s5 1084; GFX9-NEXT: s_add_i32 s7, s0, 1 1085; GFX9-NEXT: s_cmp_ge_u32 s5, s1 1086; GFX9-NEXT: s_cselect_b32 s1, s7, s0 1087; GFX9-NEXT: s_abs_i32 s5, s4 1088; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 1089; GFX9-NEXT: s_xor_b32 s1, s1, s6 1090; GFX9-NEXT: s_sub_i32 s10, 0, s5 1091; GFX9-NEXT: s_sub_i32 s6, s1, s6 1092; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1093; GFX9-NEXT: s_mov_b32 s0, s8 1094; GFX9-NEXT: v_readfirstlane_b32 s8, v5 1095; GFX9-NEXT: s_xor_b32 s4, s8, s4 1096; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1097; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1098; GFX9-NEXT: s_abs_i32 s8, s8 1099; GFX9-NEXT: s_ashr_i32 s4, s4, 31 1100; GFX9-NEXT: v_readfirstlane_b32 s7, v2 1101; GFX9-NEXT: v_readfirstlane_b32 s1, v0 1102; GFX9-NEXT: s_mul_i32 s10, s10, s1 1103; GFX9-NEXT: s_mul_hi_u32 s10, s1, s10 1104; GFX9-NEXT: s_add_i32 
s1, s1, s10 1105; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1 1106; GFX9-NEXT: s_mul_i32 s10, s1, s5 1107; GFX9-NEXT: s_sub_i32 s8, s8, s10 1108; GFX9-NEXT: s_add_i32 s11, s1, 1 1109; GFX9-NEXT: s_sub_i32 s10, s8, s5 1110; GFX9-NEXT: s_cmp_ge_u32 s8, s5 1111; GFX9-NEXT: s_cselect_b32 s1, s11, s1 1112; GFX9-NEXT: s_cselect_b32 s8, s10, s8 1113; GFX9-NEXT: s_add_i32 s10, s1, 1 1114; GFX9-NEXT: s_cmp_ge_u32 s8, s5 1115; GFX9-NEXT: s_cselect_b32 s5, s10, s1 1116; GFX9-NEXT: s_abs_i32 s8, s7 1117; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1118; GFX9-NEXT: s_xor_b32 s5, s5, s4 1119; GFX9-NEXT: s_sub_i32 s11, 0, s8 1120; GFX9-NEXT: s_sub_i32 s4, s5, s4 1121; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1122; GFX9-NEXT: v_readfirstlane_b32 s10, v6 1123; GFX9-NEXT: s_xor_b32 s7, s10, s7 1124; GFX9-NEXT: s_abs_i32 s10, s10 1125; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1126; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1127; GFX9-NEXT: s_ashr_i32 s7, s7, 31 1128; GFX9-NEXT: s_mov_b32 s1, s9 1129; GFX9-NEXT: v_readfirstlane_b32 s9, v3 1130; GFX9-NEXT: v_readfirstlane_b32 s5, v0 1131; GFX9-NEXT: s_mul_i32 s11, s11, s5 1132; GFX9-NEXT: s_mul_hi_u32 s11, s5, s11 1133; GFX9-NEXT: s_add_i32 s5, s5, s11 1134; GFX9-NEXT: s_mul_hi_u32 s5, s10, s5 1135; GFX9-NEXT: s_mul_i32 s11, s5, s8 1136; GFX9-NEXT: s_sub_i32 s10, s10, s11 1137; GFX9-NEXT: s_add_i32 s12, s5, 1 1138; GFX9-NEXT: s_sub_i32 s11, s10, s8 1139; GFX9-NEXT: s_cmp_ge_u32 s10, s8 1140; GFX9-NEXT: s_cselect_b32 s5, s12, s5 1141; GFX9-NEXT: s_cselect_b32 s10, s11, s10 1142; GFX9-NEXT: s_add_i32 s11, s5, 1 1143; GFX9-NEXT: s_cmp_ge_u32 s10, s8 1144; GFX9-NEXT: s_cselect_b32 s5, s11, s5 1145; GFX9-NEXT: s_abs_i32 s8, s9 1146; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 1147; GFX9-NEXT: v_readfirstlane_b32 s10, v7 1148; GFX9-NEXT: s_xor_b32 s5, s5, s7 1149; GFX9-NEXT: v_mov_b32_e32 v1, s4 1150; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1151; GFX9-NEXT: s_xor_b32 s4, s10, s9 1152; GFX9-NEXT: s_sub_i32 s9, 0, s8 1153; GFX9-NEXT: s_sub_i32 s5, s5, s7 1154; GFX9-NEXT: 
v_mul_f32_e32 v2, 0x4f7ffffe, v2 1155; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1156; GFX9-NEXT: v_mov_b32_e32 v0, s6 1157; GFX9-NEXT: s_abs_i32 s6, s10 1158; GFX9-NEXT: s_ashr_i32 s4, s4, 31 1159; GFX9-NEXT: v_readfirstlane_b32 s7, v2 1160; GFX9-NEXT: s_mul_i32 s9, s9, s7 1161; GFX9-NEXT: s_mul_hi_u32 s9, s7, s9 1162; GFX9-NEXT: s_add_i32 s7, s7, s9 1163; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7 1164; GFX9-NEXT: s_mul_i32 s9, s7, s8 1165; GFX9-NEXT: s_sub_i32 s6, s6, s9 1166; GFX9-NEXT: s_add_i32 s10, s7, 1 1167; GFX9-NEXT: s_sub_i32 s9, s6, s8 1168; GFX9-NEXT: s_cmp_ge_u32 s6, s8 1169; GFX9-NEXT: s_cselect_b32 s7, s10, s7 1170; GFX9-NEXT: s_cselect_b32 s6, s9, s6 1171; GFX9-NEXT: s_add_i32 s9, s7, 1 1172; GFX9-NEXT: s_cmp_ge_u32 s6, s8 1173; GFX9-NEXT: s_cselect_b32 s6, s9, s7 1174; GFX9-NEXT: s_xor_b32 s6, s6, s4 1175; GFX9-NEXT: s_sub_i32 s4, s6, s4 1176; GFX9-NEXT: v_mov_b32_e32 v2, s5 1177; GFX9-NEXT: v_mov_b32_e32 v3, s4 1178; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1179; GFX9-NEXT: s_endpgm 1180; 1181; EG-LABEL: sdiv_v4i32: 1182; EG: ; %bb.0: 1183; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1184; EG-NEXT: TEX 1 @6 1185; EG-NEXT: ALU 101, @11, KC0[CB0:0-32], KC1[] 1186; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 1187; EG-NEXT: CF_END 1188; EG-NEXT: PAD 1189; EG-NEXT: Fetch clause starting at 6: 1190; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 1191; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1192; EG-NEXT: ALU clause starting at 10: 1193; EG-NEXT: MOV * T0.X, KC0[2].Z, 1194; EG-NEXT: ALU clause starting at 11: 1195; EG-NEXT: SETGT_INT * T2.W, 0.0, T1.W, 1196; EG-NEXT: ADD_INT * T1.W, T1.W, PV.W, 1197; EG-NEXT: XOR_INT * T1.W, PV.W, T2.W, 1198; EG-NEXT: SUB_INT T3.W, 0.0, PV.W, 1199; EG-NEXT: RECIP_UINT * T2.X, PV.W, 1200; EG-NEXT: SETGT_INT T4.W, 0.0, T0.W, 1201; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, 1202; EG-NEXT: SETGT_INT T2.Z, 0.0, T1.Y, 1203; EG-NEXT: ADD_INT T0.W, T0.W, PV.W, 1204; EG-NEXT: MULHI * T2.Y, T2.X, PS, 1205; EG-NEXT: ADD_INT 
T3.Z, T2.X, PS, 1206; EG-NEXT: XOR_INT T0.W, PV.W, T4.W, 1207; EG-NEXT: ADD_INT * T3.W, T1.Y, PV.Z, 1208; EG-NEXT: XOR_INT T3.W, PS, T2.Z, 1209; EG-NEXT: MULHI * T1.Y, PV.W, PV.Z, 1210; EG-NEXT: SUB_INT T5.W, 0.0, PV.W, 1211; EG-NEXT: RECIP_UINT * T2.X, PV.W, 1212; EG-NEXT: SETGT_INT T6.W, 0.0, T0.Y, 1213; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, 1214; EG-NEXT: ADD_INT T5.W, T0.Y, PV.W, 1215; EG-NEXT: MULHI * T0.Y, T2.X, PS, 1216; EG-NEXT: ADD_INT T0.Y, T2.X, PS, 1217; EG-NEXT: XOR_INT T3.Z, PV.W, T6.W, BS:VEC_021/SCL_122 1218; EG-NEXT: SETGT_INT T5.W, 0.0, T1.Z, 1219; EG-NEXT: MULLO_INT * T2.X, T1.Y, T1.W, 1220; EG-NEXT: ADD_INT T7.W, T1.Z, PV.W, 1221; EG-NEXT: MULHI * T0.Y, PV.Z, PV.Y, 1222; EG-NEXT: XOR_INT T7.W, PV.W, T5.W, BS:VEC_021/SCL_122 1223; EG-NEXT: MULLO_INT * T1.Z, PS, T3.W, 1224; EG-NEXT: SUB_INT T4.Z, 0.0, PV.W, 1225; EG-NEXT: SETGT_INT T8.W, 0.0, T1.X, 1226; EG-NEXT: RECIP_UINT * T2.Y, PV.W, 1227; EG-NEXT: ADD_INT T9.W, T1.X, PV.W, 1228; EG-NEXT: MULLO_INT * T1.X, PV.Z, PS, 1229; EG-NEXT: SETGT_INT T4.Z, 0.0, T0.Z, 1230; EG-NEXT: XOR_INT T9.W, PV.W, T8.W, 1231; EG-NEXT: MULHI * T1.X, T2.Y, PS, 1232; EG-NEXT: ADD_INT T1.X, T2.Y, PS, 1233; EG-NEXT: SUB_INT T2.Y, 0.0, PV.W, 1234; EG-NEXT: SUB_INT T1.Z, T3.Z, T1.Z, 1235; EG-NEXT: ADD_INT T10.W, T0.Z, PV.Z, BS:VEC_201 1236; EG-NEXT: RECIP_UINT * T0.Z, PV.W, 1237; EG-NEXT: XOR_INT T3.X, PV.W, T4.Z, 1238; EG-NEXT: ADD_INT T3.Y, T0.Y, 1, 1239; EG-NEXT: SETGE_UINT T3.Z, PV.Z, T3.W, 1240; EG-NEXT: SUB_INT T10.W, PV.Z, T3.W, 1241; EG-NEXT: MULLO_INT * T2.Y, PV.Y, PS, 1242; EG-NEXT: CNDE_INT T1.Z, PV.Z, T1.Z, PV.W, 1243; EG-NEXT: CNDE_INT T10.W, PV.Z, T0.Y, PV.Y, 1244; EG-NEXT: MULHI * T0.Y, PV.X, T1.X, 1245; EG-NEXT: SETGT_INT T3.Y, 0.0, T0.X, 1246; EG-NEXT: ADD_INT T3.Z, PV.W, 1, 1247; EG-NEXT: SETGE_UINT T3.W, PV.Z, T3.W, BS:VEC_021/SCL_122 1248; EG-NEXT: MULLO_INT * T1.X, PS, T7.W, 1249; EG-NEXT: CNDE_INT T4.Y, PV.W, T10.W, PV.Z, 1250; EG-NEXT: ADD_INT T1.Z, T0.X, PV.Y, 1251; EG-NEXT: SUB_INT T3.W, T3.X, PS, 
BS:VEC_120/SCL_212 1252; EG-NEXT: MULHI * T0.X, T0.Z, T2.Y, 1253; EG-NEXT: ADD_INT T1.X, T0.Y, 1, 1254; EG-NEXT: SETGE_UINT T2.Y, PV.W, T7.W, 1255; EG-NEXT: ADD_INT T0.Z, T0.Z, PS, 1256; EG-NEXT: XOR_INT T10.W, PV.Z, T3.Y, 1257; EG-NEXT: SUB_INT * T0.W, T0.W, T2.X, 1258; EG-NEXT: SUB_INT T0.X, T3.W, T7.W, 1259; EG-NEXT: ADD_INT T5.Y, T1.Y, 1, 1260; EG-NEXT: SETGE_UINT T1.Z, PS, T1.W, BS:VEC_021/SCL_122 1261; EG-NEXT: SUB_INT T11.W, PS, T1.W, BS:VEC_021/SCL_122 1262; EG-NEXT: MULHI * T0.Z, PV.W, PV.Z, 1263; EG-NEXT: CNDE_INT T2.X, PV.Z, T0.W, PV.W, BS:VEC_021/SCL_122 1264; EG-NEXT: CNDE_INT T1.Y, PV.Z, T1.Y, PV.Y, 1265; EG-NEXT: CNDE_INT T1.Z, T2.Y, T3.W, PV.X, BS:VEC_201 1266; EG-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T1.X, BS:VEC_201 1267; EG-NEXT: MULLO_INT * T0.X, PS, T9.W, 1268; EG-NEXT: ADD_INT T1.X, PV.W, 1, 1269; EG-NEXT: SETGE_UINT T0.Y, PV.Z, T7.W, 1270; EG-NEXT: ADD_INT T1.Z, PV.Y, 1, 1271; EG-NEXT: SETGE_UINT T1.W, PV.X, T1.W, BS:VEC_102/SCL_221 1272; EG-NEXT: SUB_INT * T3.W, T10.W, PS, 1273; EG-NEXT: ADD_INT T0.X, T0.Z, 1, 1274; EG-NEXT: SETGE_UINT T2.Y, PS, T9.W, BS:VEC_102/SCL_221 1275; EG-NEXT: SUB_INT T3.Z, PS, T9.W, BS:VEC_102/SCL_221 1276; EG-NEXT: CNDE_INT T1.W, PV.W, T1.Y, PV.Z, 1277; EG-NEXT: XOR_INT * T2.W, T4.W, T2.W, 1278; EG-NEXT: XOR_INT T2.X, PV.W, PS, 1279; EG-NEXT: CNDE_INT T1.Y, PV.Y, T3.W, PV.Z, BS:VEC_021/SCL_122 1280; EG-NEXT: CNDE_INT T0.Z, PV.Y, T0.Z, PV.X, 1281; EG-NEXT: CNDE_INT T0.W, T0.Y, T0.W, T1.X, BS:VEC_102/SCL_221 1282; EG-NEXT: XOR_INT * T1.W, T4.Z, T5.W, 1283; EG-NEXT: XOR_INT T0.X, T6.W, T2.Z, 1284; EG-NEXT: XOR_INT T0.Y, PV.W, PS, 1285; EG-NEXT: ADD_INT T1.Z, PV.Z, 1, 1286; EG-NEXT: SETGE_UINT T0.W, PV.Y, T9.W, BS:VEC_021/SCL_122 1287; EG-NEXT: SUB_INT * T2.W, PV.X, T2.W, 1288; EG-NEXT: CNDE_INT T1.Y, PV.W, T0.Z, PV.Z, 1289; EG-NEXT: SUB_INT T2.Z, PV.Y, T1.W, 1290; EG-NEXT: XOR_INT T0.W, T3.Y, T8.W, BS:VEC_021/SCL_122 1291; EG-NEXT: XOR_INT * T1.W, T4.Y, PV.X, 1292; EG-NEXT: SUB_INT T2.Y, PS, T0.X, 1293; EG-NEXT: XOR_INT * 
T1.W, PV.Y, PV.W, 1294; EG-NEXT: SUB_INT T2.X, PV.W, T0.W, 1295; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1296; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1297 %den_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 1298 %num = load <4 x i32>, ptr addrspace(1) %in 1299 %den = load <4 x i32>, ptr addrspace(1) %den_ptr 1300 %result = sdiv <4 x i32> %num, %den 1301 store <4 x i32> %result, ptr addrspace(1) %out 1302 ret void 1303} 1304 1305define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1306; GCN-LABEL: sdiv_v4i32_4: 1307; GCN: ; %bb.0: 1308; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1309; GCN-NEXT: s_mov_b32 s7, 0xf000 1310; GCN-NEXT: s_mov_b32 s6, -1 1311; GCN-NEXT: s_mov_b32 s10, s6 1312; GCN-NEXT: s_mov_b32 s11, s7 1313; GCN-NEXT: s_waitcnt lgkmcnt(0) 1314; GCN-NEXT: s_mov_b32 s8, s2 1315; GCN-NEXT: s_mov_b32 s9, s3 1316; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1317; GCN-NEXT: s_mov_b32 s4, s0 1318; GCN-NEXT: s_mov_b32 s5, s1 1319; GCN-NEXT: s_waitcnt vmcnt(0) 1320; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1321; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1322; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1323; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1324; GCN-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1325; GCN-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1326; GCN-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1327; GCN-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1328; GCN-NEXT: v_add_i32_e32 v0, vcc, v4, v0 1329; GCN-NEXT: v_add_i32_e32 v1, vcc, v5, v1 1330; GCN-NEXT: v_add_i32_e32 v2, vcc, v6, v2 1331; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 1332; GCN-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1333; GCN-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1334; GCN-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1335; GCN-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1336; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1337; GCN-NEXT: s_endpgm 1338; 1339; TONGA-LABEL: sdiv_v4i32_4: 1340; TONGA: ; %bb.0: 1341; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 1342; TONGA-NEXT: s_mov_b32 s3, 0xf000 
1343; TONGA-NEXT: s_mov_b32 s2, -1 1344; TONGA-NEXT: s_mov_b32 s10, s2 1345; TONGA-NEXT: s_mov_b32 s11, s3 1346; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1347; TONGA-NEXT: s_mov_b32 s8, s6 1348; TONGA-NEXT: s_mov_b32 s9, s7 1349; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1350; TONGA-NEXT: s_mov_b32 s0, s4 1351; TONGA-NEXT: s_mov_b32 s1, s5 1352; TONGA-NEXT: s_waitcnt vmcnt(0) 1353; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1354; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1355; TONGA-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1356; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1357; TONGA-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1358; TONGA-NEXT: v_lshrrev_b32_e32 v5, 30, v5 1359; TONGA-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1360; TONGA-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1361; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 1362; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1 1363; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v2 1364; TONGA-NEXT: v_add_u32_e32 v3, vcc, v7, v3 1365; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1366; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1367; TONGA-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1368; TONGA-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1369; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1370; TONGA-NEXT: s_endpgm 1371; 1372; GFX9-LABEL: sdiv_v4i32_4: 1373; GFX9: ; %bb.0: 1374; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1375; GFX9-NEXT: s_mov_b32 s7, 0xf000 1376; GFX9-NEXT: s_mov_b32 s6, -1 1377; GFX9-NEXT: s_mov_b32 s10, s6 1378; GFX9-NEXT: s_mov_b32 s11, s7 1379; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1380; GFX9-NEXT: s_mov_b32 s8, s2 1381; GFX9-NEXT: s_mov_b32 s9, s3 1382; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 1383; GFX9-NEXT: s_mov_b32 s4, s0 1384; GFX9-NEXT: s_mov_b32 s5, s1 1385; GFX9-NEXT: s_waitcnt vmcnt(0) 1386; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 1387; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1 1388; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v2 1389; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v3 1390; GFX9-NEXT: v_lshrrev_b32_e32 v4, 30, v4 1391; GFX9-NEXT: 
v_lshrrev_b32_e32 v5, 30, v5 1392; GFX9-NEXT: v_lshrrev_b32_e32 v6, 30, v6 1393; GFX9-NEXT: v_lshrrev_b32_e32 v7, 30, v7 1394; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 1395; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 1396; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 1397; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 1398; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0 1399; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1 1400; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2 1401; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3 1402; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1403; GFX9-NEXT: s_endpgm 1404; 1405; EG-LABEL: sdiv_v4i32_4: 1406; EG: ; %bb.0: 1407; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1408; EG-NEXT: TEX 0 @6 1409; EG-NEXT: ALU 24, @9, KC0[CB0:0-32], KC1[] 1410; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 1411; EG-NEXT: CF_END 1412; EG-NEXT: PAD 1413; EG-NEXT: Fetch clause starting at 6: 1414; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1415; EG-NEXT: ALU clause starting at 8: 1416; EG-NEXT: MOV * T0.X, KC0[2].Z, 1417; EG-NEXT: ALU clause starting at 9: 1418; EG-NEXT: ASHR T1.W, T0.W, literal.x, 1419; EG-NEXT: ASHR * T2.W, T0.Z, literal.x, 1420; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1421; EG-NEXT: LSHR * T1.W, PV.W, literal.x, 1422; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 1423; EG-NEXT: ADD_INT T1.Z, T0.W, PV.W, 1424; EG-NEXT: LSHR T0.W, T2.W, literal.x, BS:VEC_120/SCL_212 1425; EG-NEXT: ASHR * T1.W, T0.Y, literal.y, 1426; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 1427; EG-NEXT: LSHR T1.Y, PS, literal.x, 1428; EG-NEXT: ASHR T2.Z, T0.X, literal.y, 1429; EG-NEXT: ADD_INT T0.W, T0.Z, PV.W, 1430; EG-NEXT: ASHR * T1.W, PV.Z, literal.z, 1431; EG-NEXT: 30(4.203895e-44), 31(4.344025e-44) 1432; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1433; EG-NEXT: ASHR T1.Z, PV.W, literal.x, 1434; EG-NEXT: LSHR T0.W, PV.Z, literal.y, 1435; EG-NEXT: ADD_INT * T2.W, T0.Y, PV.Y, 1436; EG-NEXT: 2(2.802597e-45), 30(4.203895e-44) 1437; EG-NEXT: ASHR T1.Y, PS, literal.x, 1438; EG-NEXT: ADD_INT * T0.W, T0.X, 
PV.W, 1439; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1440; EG-NEXT: ASHR T1.X, PV.W, literal.x, 1441; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1442; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1443 %num = load <4 x i32>, ptr addrspace(1) %in 1444 %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> 1445 store <4 x i32> %result, ptr addrspace(1) %out 1446 ret void 1447} 1448 1449define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1450; GCN-LABEL: v_sdiv_i8: 1451; GCN: ; %bb.0: 1452; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1453; GCN-NEXT: s_mov_b32 s7, 0xf000 1454; GCN-NEXT: s_mov_b32 s6, -1 1455; GCN-NEXT: s_mov_b32 s10, s6 1456; GCN-NEXT: s_mov_b32 s11, s7 1457; GCN-NEXT: s_waitcnt lgkmcnt(0) 1458; GCN-NEXT: s_mov_b32 s8, s2 1459; GCN-NEXT: s_mov_b32 s9, s3 1460; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 1461; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 1462; GCN-NEXT: s_mov_b32 s4, s0 1463; GCN-NEXT: s_mov_b32 s5, s1 1464; GCN-NEXT: s_waitcnt vmcnt(1) 1465; GCN-NEXT: v_cvt_f32_i32_e32 v2, v0 1466; GCN-NEXT: s_waitcnt vmcnt(0) 1467; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 1468; GCN-NEXT: v_xor_b32_e32 v0, v1, v0 1469; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1470; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 1471; GCN-NEXT: v_or_b32_e32 v0, 1, v0 1472; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 1473; GCN-NEXT: v_trunc_f32_e32 v1, v1 1474; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 1475; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 1476; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 1477; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1478; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1479; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 1480; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 1481; GCN-NEXT: s_endpgm 1482; 1483; TONGA-LABEL: v_sdiv_i8: 1484; TONGA: ; %bb.0: 1485; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1486; TONGA-NEXT: s_mov_b32 s7, 0xf000 1487; TONGA-NEXT: s_mov_b32 s6, -1 1488; TONGA-NEXT: s_mov_b32 s10, s6 1489; TONGA-NEXT: s_mov_b32 s11, s7 
1490; TONGA-NEXT: s_waitcnt lgkmcnt(0) 1491; TONGA-NEXT: s_mov_b32 s8, s2 1492; TONGA-NEXT: s_mov_b32 s9, s3 1493; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 1494; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 1495; TONGA-NEXT: s_mov_b32 s4, s0 1496; TONGA-NEXT: s_mov_b32 s5, s1 1497; TONGA-NEXT: s_waitcnt vmcnt(1) 1498; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0 1499; TONGA-NEXT: s_waitcnt vmcnt(0) 1500; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v1 1501; TONGA-NEXT: v_xor_b32_e32 v0, v1, v0 1502; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1503; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 1504; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 1505; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 1506; TONGA-NEXT: v_trunc_f32_e32 v1, v1 1507; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3 1508; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1 1509; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 1510; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1511; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 1512; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8 1513; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0 1514; TONGA-NEXT: s_endpgm 1515; 1516; GFX9-LABEL: v_sdiv_i8: 1517; GFX9: ; %bb.0: 1518; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1519; GFX9-NEXT: s_mov_b32 s7, 0xf000 1520; GFX9-NEXT: s_mov_b32 s6, -1 1521; GFX9-NEXT: s_mov_b32 s10, s6 1522; GFX9-NEXT: s_mov_b32 s11, s7 1523; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1524; GFX9-NEXT: s_mov_b32 s8, s2 1525; GFX9-NEXT: s_mov_b32 s9, s3 1526; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1 1527; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 1528; GFX9-NEXT: s_mov_b32 s4, s0 1529; GFX9-NEXT: s_mov_b32 s5, s1 1530; GFX9-NEXT: s_waitcnt vmcnt(1) 1531; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0 1532; GFX9-NEXT: s_waitcnt vmcnt(0) 1533; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 1534; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 1535; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 1536; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 1537; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 1538; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 
1539; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1540; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1 1541; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3 1542; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 1543; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 1544; GFX9-NEXT: v_add_u32_e32 v0, v4, v0 1545; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8 1546; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 1547; GFX9-NEXT: s_endpgm 1548; 1549; EG-LABEL: v_sdiv_i8: 1550; EG: ; %bb.0: 1551; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1552; EG-NEXT: TEX 1 @6 1553; EG-NEXT: ALU 21, @11, KC0[CB0:0-32], KC1[] 1554; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1555; EG-NEXT: CF_END 1556; EG-NEXT: PAD 1557; EG-NEXT: Fetch clause starting at 6: 1558; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1 1559; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1560; EG-NEXT: ALU clause starting at 10: 1561; EG-NEXT: MOV * T0.X, KC0[2].Z, 1562; EG-NEXT: ALU clause starting at 11: 1563; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x, 1564; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1565; EG-NEXT: INT_TO_FLT * T0.Y, PV.W, 1566; EG-NEXT: BFE_INT T1.W, T0.X, 0.0, literal.x, 1567; EG-NEXT: RECIP_IEEE * T0.X, PS, 1568; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 1569; EG-NEXT: INT_TO_FLT * T0.Z, PV.W, 1570; EG-NEXT: MUL_IEEE * T2.W, PS, T0.X, 1571; EG-NEXT: TRUNC T2.W, PV.W, 1572; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W, 1573; EG-NEXT: ASHR T0.W, PS, literal.x, 1574; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.Y, T0.Z, 1575; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 1576; EG-NEXT: TRUNC T0.Z, T2.W, 1577; EG-NEXT: SETGE T1.W, |PS|, |T0.Y|, 1578; EG-NEXT: OR_INT * T0.W, PV.W, 1, 1579; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS, 1580; EG-NEXT: FLT_TO_INT * T1.W, PV.Z, 1581; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1582; EG-NEXT: BFE_INT T0.X, PV.W, 0.0, literal.x, 1583; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1584; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) 1585 %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1 1586 %num = load i8, ptr addrspace(1) %in 1587 %den = load i8, 
; v_sdiv_i23: signed division at a 23-bit integer width.
;
; The numerator is loaded from %in and the denominator from the i23 element
; that follows it; the i23 quotient is sign-extended to i32 and stored to %out.
;
; The expected output of every run line converts both operands to float and
; computes the quotient with a reciprocal, multiply, truncate and a
; multiply-add based correction step, then sign-extends the result from
; 23 bits (v_bfe_i32 ..., 0, 23 on the amdgcn runs; a shift-left followed by
; an arithmetic shift-right by 9 on the r600 run).
; NOTE(review): presumably this path is chosen because 23-bit operands are
; exactly representable in f32 - confirm against the AMDGPU division lowering.
define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: v_sdiv_i23:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 23
; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v0, v3, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GCN-NEXT: v_or_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v1, v3, v4
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: v_sdiv_i23:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:6
; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(3)
; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; TONGA-NEXT: s_waitcnt vmcnt(2)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_or_b32_e32 v1, v2, v1
; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 23
; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_or_b32_e32 v0, v3, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0
; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4
; TONGA-NEXT: v_trunc_f32_e32 v1, v1
; TONGA-NEXT: v_mad_f32 v3, -v1, v2, v3
; TONGA-NEXT: v_cvt_i32_f32_e32 v1, v1
; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2|
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i23:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s10
; GFX9-NEXT: s_mov_b32 s5, s11
; GFX9-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:2
; GFX9-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:6
; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 23
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, v3, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v2, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2|
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i23:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 33, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T0.W, T0.X, PV.W,
; EG-NEXT: LSHL * T1.W, T3.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.W, PV.W, literal.x,
; EG-NEXT: OR_INT * T1.W, T2.X, T1.W,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T1.W, PS, literal.x,
; EG-NEXT: INT_TO_FLT * T0.X, PV.W,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T1.W, PV.W, literal.x,
; EG-NEXT: RECIP_IEEE * T0.Y, PS,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y,
; EG-NEXT: TRUNC T2.W, PV.W,
; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W,
; EG-NEXT: ASHR T0.W, PS, literal.x,
; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z,
; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
; EG-NEXT: TRUNC T0.Z, T2.W,
; EG-NEXT: SETGE T1.W, |PS|, |T0.X|,
; EG-NEXT: OR_INT * T0.W, PV.W, 1,
; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 9(1.261169e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 9(1.261169e-44), 2(2.802597e-45)
  ; The denominator is the i23 element immediately after the numerator.
  %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1
  %num = load i23, ptr addrspace(1) %in
  %den = load i23, ptr addrspace(1) %den_ptr
  %result = sdiv i23 %num, %den
  ; Widen with sext so the stored i32 keeps the sign of the i23 quotient.
  %result.ext = sext i23 %result to i32
  store i32 %result.ext, ptr addrspace(1) %out
  ret void
}
; v_sdiv_i24: signed division at a 24-bit integer width.
;
; The numerator is loaded from %in and the denominator from the i24 element
; that follows it; the i24 quotient is sign-extended to i32 and stored to %out.
;
; The expected output of every run line converts both operands to float and
; computes the quotient with a reciprocal, multiply, truncate and a
; multiply-add based correction step. On the amdgcn runs the top byte of each
; operand is fetched with a sign-extending byte load (buffer_load_sbyte) and
; the result is sign-extended from 24 bits with v_bfe_i32 ..., 0, 24; the r600
; run uses BFE_INT on the loaded byte and a shift-left / arithmetic
; shift-right by 8 on the result.
define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: v_sdiv_i24:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6
; GCN-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; GCN-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2
; GCN-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(3)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_or_b32_e32 v1, v1, v4
; GCN-NEXT: v_cvt_f32_i32_e32 v1, v1
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v3, v3, v4
; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GCN-NEXT: v_or_b32_e32 v0, 1, v0
; GCN-NEXT: v_mul_f32_e32 v2, v3, v4
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mad_f32 v3, -v2, v1, v3
; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: v_sdiv_i24:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:6
; TONGA-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; TONGA-NEXT: buffer_load_sbyte v2, off, s[8:11], 0 offset:2
; TONGA-NEXT: buffer_load_ushort v3, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(3)
; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; TONGA-NEXT: s_waitcnt vmcnt(2)
; TONGA-NEXT: v_or_b32_e32 v1, v1, v4
; TONGA-NEXT: v_cvt_f32_i32_e32 v1, v1
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_or_b32_e32 v3, v3, v4
; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3
; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1
; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; TONGA-NEXT: v_or_b32_e32 v0, 1, v0
; TONGA-NEXT: v_mul_f32_e32 v2, v3, v4
; TONGA-NEXT: v_trunc_f32_e32 v2, v2
; TONGA-NEXT: v_mad_f32 v3, -v2, v1, v3
; TONGA-NEXT: v_cvt_i32_f32_e32 v2, v2
; TONGA-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1|
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 24
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i24:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s10
; GFX9-NEXT: s_mov_b32 s5, s11
; GFX9-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 offset:6
; GFX9-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:4
; GFX9-NEXT: buffer_load_sbyte v2, off, s[4:7], 0 offset:2
; GFX9-NEXT: buffer_load_ushort v3, off, s[4:7], 0
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_or_b32_e32 v1, v1, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1
; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0
; GFX9-NEXT: v_or_b32_e32 v0, 1, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v1, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1|
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 24
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i24:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 29, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT: VTX_READ_16 T3.X, T0.X, 4, #1
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 2, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: BFE_INT * T0.W, T1.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T2.W, T0.X, 0.0, literal.x,
; EG-NEXT: OR_INT * T1.W, T3.X, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T3.W, PV.W, literal.x,
; EG-NEXT: INT_TO_FLT * T0.X, PS,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT T1.W, T2.X, PV.W,
; EG-NEXT: RECIP_IEEE * T0.Y, PS,
; EG-NEXT: INT_TO_FLT * T0.Z, PV.W,
; EG-NEXT: MUL_IEEE * T1.W, PS, T0.Y,
; EG-NEXT: TRUNC T1.W, PV.W,
; EG-NEXT: XOR_INT * T0.W, T2.W, T0.W,
; EG-NEXT: ASHR T0.W, PS, literal.x,
; EG-NEXT: MULADD_IEEE * T2.W, -PV.W, T0.X, T0.Z,
; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00)
; EG-NEXT: TRUNC T0.Z, T1.W,
; EG-NEXT: SETGE T1.W, |PS|, |T0.X|,
; EG-NEXT: OR_INT * T0.W, PV.W, 1,
; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS,
; EG-NEXT: FLT_TO_INT * T1.W, PV.Z,
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
  ; The denominator is the i24 element immediately after the numerator.
  %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1
  %num = load i24, ptr addrspace(1) %in
  %den = load i24, ptr addrspace(1) %den_ptr
  %result = sdiv i24 %num, %den
  ; Widen with sext so the stored i32 keeps the sign of the i24 quotient.
  %result.ext = sext i24 %result to i32
  store i32 %result.ext, ptr addrspace(1) %out
  ret void
}

; v_sdiv_i25: signed division at a 25-bit integer width.
;
; The numerator is loaded from %in and the denominator from the i25 element
; that follows it; the i25 quotient is sign-extended to i32 and stored to %out.
;
; The expected output sign-extends both operands from 25 bits and then uses an
; integer expansion: absolute values, an approximate reciprocal refined with
; mul_lo / mul_hi, two compare-and-select correction steps, and a final
; xor / subtract to restore the sign. The gfx900 run performs most of this in
; scalar registers after v_readfirstlane; the r600 run uses RECIP_UINT,
; MULLO_INT and MULHI.
; NOTE(review): presumably 25 bits is one past the widest operand handled by
; the float-based sequence expected for v_sdiv_i24 - confirm against the
; AMDGPU division lowering.
define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: v_sdiv_i25:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v1, v1, 0, 25
; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GCN-NEXT: v_max_i32_e32 v2, v1, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2
; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
; GCN-NEXT: v_max_i32_e32 v5, v0, v5
; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
; GCN-NEXT: v_xor_b32_e32 v0, v0, v1
; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; GCN-NEXT: v_mul_lo_u32 v4, v4, v3
; GCN-NEXT: v_mul_hi_u32 v4, v3, v4
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GCN-NEXT: v_mul_hi_u32 v3, v5, v3
; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: v_sdiv_i25:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_bfe_i32 v1, v1, 0, 25
; TONGA-NEXT: v_sub_u32_e32 v2, vcc, 0, v1
; TONGA-NEXT: v_max_i32_e32 v2, v1, v2
; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2
; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0
; TONGA-NEXT: v_max_i32_e32 v5, v0, v5
; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; TONGA-NEXT: v_mul_lo_u32 v4, v4, v3
; TONGA-NEXT: v_mul_hi_u32 v4, v3, v4
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; TONGA-NEXT: v_mul_hi_u32 v3, v5, v3
; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1
; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s10
; GFX9-NEXT: s_mov_b32 s5, s11
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: s_bfe_i32 s4, s0, 0x190000
; GFX9-NEXT: s_abs_i32 s5, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: s_sub_i32 s7, 0, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: s_bfe_i32 s6, s6, 0x190000
; GFX9-NEXT: s_xor_b32 s4, s6, s4
; GFX9-NEXT: s_abs_i32 s6, s6
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_readfirstlane_b32 s8, v0
; GFX9-NEXT: s_mul_i32 s7, s7, s8
; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7
; GFX9-NEXT: s_add_i32 s8, s8, s7
; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
; GFX9-NEXT: s_mul_i32 s8, s7, s5
; GFX9-NEXT: s_sub_i32 s6, s6, s8
; GFX9-NEXT: s_add_i32 s9, s7, 1
; GFX9-NEXT: s_sub_i32 s8, s6, s5
; GFX9-NEXT: s_cmp_ge_u32 s6, s5
; GFX9-NEXT: s_cselect_b32 s7, s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
; GFX9-NEXT: s_add_i32 s8, s7, 1
; GFX9-NEXT: s_cmp_ge_u32 s6, s5
; GFX9-NEXT: s_cselect_b32 s5, s8, s7
; GFX9-NEXT: s_xor_b32 s5, s5, s4
; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: s_bfe_i32 s4, s4, 0x190000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i25:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: MOV * T1.X, PV.X,
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: ASHR * T0.W, PV.W, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: SETGT_INT * T1.W, 0.0, PV.W,
; EG-NEXT: ADD_INT T0.W, T0.W, PV.W,
; EG-NEXT: LSHL * T2.W, T1.X, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: XOR_INT * T0.W, PV.W, T1.W,
; EG-NEXT: SUB_INT T0.Z, 0.0, PV.W,
; EG-NEXT: ASHR T2.W, T2.W, literal.x,
; EG-NEXT: RECIP_UINT * T0.X, PV.W,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: SETGT_INT T3.W, 0.0, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, PV.Z, PS,
; EG-NEXT: ADD_INT T2.W, T2.W, PV.W,
; EG-NEXT: MULHI * T0.Y, T0.X, PS,
; EG-NEXT: ADD_INT T4.W, T0.X, PS,
; EG-NEXT: XOR_INT * T2.W, PV.W, T3.W,
; EG-NEXT: MULHI * T0.X, PS, PV.W,
; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W,
; EG-NEXT: SUB_INT * T2.W, T2.W, PS,
; EG-NEXT: ADD_INT T0.Z, T0.X, 1,
; EG-NEXT: SETGE_UINT T4.W, PV.W, T0.W,
; EG-NEXT: SUB_INT * T5.W, PV.W, T0.W,
; EG-NEXT: CNDE_INT T2.W, PV.W, T2.W, PS,
; EG-NEXT: CNDE_INT * T4.W, PV.W, T0.X, PV.Z,
; EG-NEXT: ADD_INT T5.W, PS, 1,
; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W,
; EG-NEXT: CNDE_INT T0.W, PS, T4.W, PV.W, BS:VEC_102/SCL_221
; EG-NEXT: XOR_INT * T1.W, T3.W, T1.W,
; EG-NEXT: XOR_INT * T0.W, PV.W, PS,
; EG-NEXT: SUB_INT * T0.W, PV.W, T1.W,
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 7(9.809089e-45), 0(0.000000e+00)
; EG-NEXT: ASHR T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 7(9.809089e-45), 2(2.802597e-45)
  ; The denominator is the i25 element immediately after the numerator.
  %den_ptr = getelementptr i25, ptr addrspace(1) %in, i25 1
  %num = load i25, ptr addrspace(1) %in
  %den = load i25, ptr addrspace(1) %den_ptr
  %result = sdiv i25 %num, %den
  ; Widen with sext so the stored i32 keeps the sign of the i25 quotient.
  %result.ext = sext i25 %result to i32
  store i32 %result.ext, ptr addrspace(1) %out
  ret void
}
; scalarize_mulhs_4xi32: sdiv of a <4 x i32> vector by a constant splat.
;
; Loads a <4 x i32> from %in, divides every lane by 53668 and stores the
; result to %out. Note the argument order: %in is the first pointer argument
; and %out the second, the reverse of the v_sdiv_* tests in this file.
;
; The expected output contains no division sequence. Each lane is handled
; separately: a signed multiply-high by 0x1389c755 (327796565), an arithmetic
; shift right by 12, and an add of the product's sign bit (logical shift
; right by 31).
define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) {
; GCN-LABEL: scalarize_mulhs_4xi32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_mov_b32 s0, 0x1389c755
; GCN-NEXT: s_mov_b32 s4, s2
; GCN-NEXT: s_mov_b32 s5, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_i32 v0, v0, s0
; GCN-NEXT: v_mul_hi_i32 v1, v1, s0
; GCN-NEXT: v_mul_hi_i32 v2, v2, s0
; GCN-NEXT: v_mul_hi_i32 v3, v3, s0
; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; GCN-NEXT: v_ashrrev_i32_e32 v1, 12, v1
; GCN-NEXT: v_lshrrev_b32_e32 v6, 31, v2
; GCN-NEXT: v_ashrrev_i32_e32 v2, 12, v2
; GCN-NEXT: v_lshrrev_b32_e32 v7, 31, v3
; GCN-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s4, s0
; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
; TONGA-NEXT: s_mov_b32 s4, s2
; TONGA-NEXT: s_mov_b32 s5, s3
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 12, v1
; TONGA-NEXT: v_lshrrev_b32_e32 v6, 31, v2
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 12, v2
; TONGA-NEXT: v_lshrrev_b32_e32 v7, 31, v3
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 12, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 31, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 12, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 31, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 12, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 25, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI_INT * T0.W, T0.W, literal.x,
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ASHR T1.Z, PS, literal.x,
; EG-NEXT: LSHR T0.W, PS, literal.y,
; EG-NEXT: MULHI_INT * T0.Z, T0.Z, literal.z,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ASHR T1.Y, PS, literal.x,
; EG-NEXT: LSHR T0.Z, PS, literal.y,
; EG-NEXT: ADD_INT T0.W, PV.Z, PV.W,
; EG-NEXT: MULHI_INT * T0.Y, T0.Y, literal.z,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ASHR T2.Y, PS, literal.x,
; EG-NEXT: ADD_INT T0.Z, PV.Y, PV.Z,
; EG-NEXT: LSHR T1.W, PS, literal.y,
; EG-NEXT: MULHI_INT * T0.X, T0.X, literal.z,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.Y, PV.Y, PV.W,
; EG-NEXT: ASHR T1.W, PS, literal.x,
; EG-NEXT: LSHR * T2.W, PS, literal.y,
; EG-NEXT: 12(1.681558e-44), 31(4.344025e-44)
; EG-NEXT: ADD_INT T0.X, PV.W, PS,
; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %1 = load <4 x i32>, ptr addrspace(1) %in, align 16
  ; All four lanes share the same constant divisor.
  %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
  store <4 x i32> %2, ptr addrspace(1) %out, align 16
  ret void
}