1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck %s -check-prefixes=VI 4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GCN 5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck %s -check-prefixes=GFX1030 6; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG 7 ; udiv_i32 - %a is loaded from %in[0] and %b from %in[1] (i32 elements); the value stored to %out is udiv %a, %b. 8define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 9; SI-LABEL: udiv_i32: 10; SI: ; %bb.0: 11; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 12; SI-NEXT: s_mov_b32 s7, 0xf000 13; SI-NEXT: s_mov_b32 s6, -1 14; SI-NEXT: s_mov_b32 s10, s6 15; SI-NEXT: s_mov_b32 s11, s7 16; SI-NEXT: s_waitcnt lgkmcnt(0) 17; SI-NEXT: s_mov_b32 s8, s2 18; SI-NEXT: s_mov_b32 s9, s3 19; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 20; SI-NEXT: s_mov_b32 s4, s0 21; SI-NEXT: s_mov_b32 s5, s1 22; SI-NEXT: s_waitcnt vmcnt(0) 23; SI-NEXT: v_cvt_f32_u32_e32 v2, v1 24; SI-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 25; SI-NEXT: v_rcp_iflag_f32_e32 v2, v2 26; SI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 27; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 28; SI-NEXT: v_mul_lo_u32 v3, v3, v2 29; SI-NEXT: v_mul_hi_u32 v3, v2, v3 30; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 31; SI-NEXT: v_mul_hi_u32 v2, v0, v2 32; SI-NEXT: v_mul_lo_u32 v3, v2, v1 33; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v2 34; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 35; SI-NEXT: v_sub_i32_e32 v3, vcc, v0, v1 36; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 37; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 38; 
SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 39; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v2 40; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 41; SI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 42; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 43; SI-NEXT: s_endpgm 44; 45; VI-LABEL: udiv_i32: 46; VI: ; %bb.0: 47; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 48; VI-NEXT: s_mov_b32 s7, 0xf000 49; VI-NEXT: s_mov_b32 s6, -1 50; VI-NEXT: s_mov_b32 s10, s6 51; VI-NEXT: s_mov_b32 s11, s7 52; VI-NEXT: s_waitcnt lgkmcnt(0) 53; VI-NEXT: s_mov_b32 s8, s2 54; VI-NEXT: s_mov_b32 s9, s3 55; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 56; VI-NEXT: s_mov_b32 s4, s0 57; VI-NEXT: s_mov_b32 s5, s1 58; VI-NEXT: s_waitcnt vmcnt(0) 59; VI-NEXT: v_cvt_f32_u32_e32 v2, v1 60; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 61; VI-NEXT: v_rcp_iflag_f32_e32 v2, v2 62; VI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 63; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 64; VI-NEXT: v_mul_lo_u32 v3, v3, v2 65; VI-NEXT: v_mul_hi_u32 v3, v2, v3 66; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3 67; VI-NEXT: v_mul_hi_u32 v2, v0, v2 68; VI-NEXT: v_mul_lo_u32 v3, v2, v1 69; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v2 70; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 71; VI-NEXT: v_sub_u32_e32 v3, vcc, v0, v1 72; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 73; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 74; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 75; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2 76; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 77; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 78; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 79; VI-NEXT: s_endpgm 80; 81; GCN-LABEL: udiv_i32: 82; GCN: ; %bb.0: 83; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 84; GCN-NEXT: s_waitcnt lgkmcnt(0) 85; GCN-NEXT: v_mov_b32_e32 v0, s2 86; GCN-NEXT: v_mov_b32_e32 v1, s3 87; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 88; GCN-NEXT: s_waitcnt vmcnt(0) 89; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 90; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 91; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 92; GCN-NEXT: 
v_mul_f32_e32 v2, 0x4f7ffffe, v2 93; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 94; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 95; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 96; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 97; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 98; GCN-NEXT: v_mov_b32_e32 v2, s0 99; GCN-NEXT: v_mov_b32_e32 v3, s1 100; GCN-NEXT: v_mul_lo_u32 v5, v4, v1 101; GCN-NEXT: v_add_u32_e32 v6, vcc, 1, v4 102; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 103; GCN-NEXT: v_sub_u32_e32 v5, vcc, v0, v1 104; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 105; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 106; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 107; GCN-NEXT: v_add_u32_e32 v5, vcc, 1, v4 108; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 109; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc 110; GCN-NEXT: flat_store_dword v[2:3], v0 111; GCN-NEXT: s_endpgm 112; 113; GFX1030-LABEL: udiv_i32: 114; GFX1030: ; %bb.0: 115; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 116; GFX1030-NEXT: v_mov_b32_e32 v2, 0 117; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 118; GFX1030-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 119; GFX1030-NEXT: s_waitcnt vmcnt(0) 120; GFX1030-NEXT: v_readfirstlane_b32 s2, v1 121; GFX1030-NEXT: v_readfirstlane_b32 s5, v0 122; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 123; GFX1030-NEXT: s_sub_i32 s4, 0, s2 124; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 125; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 126; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 127; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 128; GFX1030-NEXT: s_mul_i32 s4, s4, s3 129; GFX1030-NEXT: s_mul_hi_u32 s4, s3, s4 130; GFX1030-NEXT: s_add_i32 s3, s3, s4 131; GFX1030-NEXT: s_mul_hi_u32 s3, s5, s3 132; GFX1030-NEXT: s_mul_i32 s4, s3, s2 133; GFX1030-NEXT: s_sub_i32 s4, s5, s4 134; GFX1030-NEXT: s_add_i32 s5, s3, 1 135; GFX1030-NEXT: s_sub_i32 s6, s4, s2 136; GFX1030-NEXT: s_cmp_ge_u32 s4, s2 137; GFX1030-NEXT: s_cselect_b32 s3, s5, s3 138; GFX1030-NEXT: s_cselect_b32 s4, s6, s4 139; GFX1030-NEXT: s_add_i32 s5, s3, 1 140; GFX1030-NEXT: s_cmp_ge_u32 s4, s2 
141; GFX1030-NEXT: s_cselect_b32 s2, s5, s3 142; GFX1030-NEXT: v_mov_b32_e32 v0, s2 143; GFX1030-NEXT: global_store_dword v2, v0, s[0:1] 144; GFX1030-NEXT: s_endpgm 145; 146; EG-LABEL: udiv_i32: 147; EG: ; %bb.0: 148; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 149; EG-NEXT: TEX 0 @6 150; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[] 151; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 152; EG-NEXT: CF_END 153; EG-NEXT: PAD 154; EG-NEXT: Fetch clause starting at 6: 155; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 156; EG-NEXT: ALU clause starting at 8: 157; EG-NEXT: MOV * T0.X, KC0[2].Z, 158; EG-NEXT: ALU clause starting at 9: 159; EG-NEXT: SUB_INT T0.W, 0.0, T0.Y, 160; EG-NEXT: RECIP_UINT * T0.Z, T0.Y, 161; EG-NEXT: MULLO_INT * T0.W, PV.W, PS, 162; EG-NEXT: MULHI * T0.W, T0.Z, PS, 163; EG-NEXT: ADD_INT * T0.W, T0.Z, PS, 164; EG-NEXT: MULHI * T0.Z, T0.X, PV.W, 165; EG-NEXT: MULLO_INT * T0.W, PS, T0.Y, 166; EG-NEXT: SUB_INT * T0.W, T0.X, PS, 167; EG-NEXT: ADD_INT T1.Z, T0.Z, 1, 168; EG-NEXT: SETGE_UINT T1.W, PV.W, T0.Y, 169; EG-NEXT: SUB_INT * T2.W, PV.W, T0.Y, 170; EG-NEXT: CNDE_INT T0.W, PV.W, T0.W, PS, 171; EG-NEXT: CNDE_INT * T1.W, PV.W, T0.Z, PV.Z, 172; EG-NEXT: ADD_INT T2.W, PS, 1, 173; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.Y, 174; EG-NEXT: CNDE_INT T0.X, PS, T1.W, PV.W, 175; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 176; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 177 %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1 178 %a = load i32, ptr addrspace(1) %in 179 %b = load i32, ptr addrspace(1) %b_ptr 180 %result = udiv i32 %a, %b 181 store i32 %result, ptr addrspace(1) %out 182 ret void 183} 184 ; s_udiv_i32 - same i32 udiv, but %a and %b are passed directly as kernel arguments, so the IR contains no loads before the division. 185define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { 186; SI-LABEL: s_udiv_i32: 187; SI: ; %bb.0: 188; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 189; SI-NEXT: s_mov_b32 s7, 0xf000 190; SI-NEXT: s_mov_b32 s6, -1 191; SI-NEXT: s_waitcnt lgkmcnt(0) 192; SI-NEXT: v_cvt_f32_u32_e32 v0, s3 193; SI-NEXT: s_sub_i32 s4, 0, s3 194; SI-NEXT: s_mov_b32 
s5, s1 195; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 196; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 197; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 198; SI-NEXT: v_mul_lo_u32 v1, s4, v0 199; SI-NEXT: s_mov_b32 s4, s0 200; SI-NEXT: v_mul_hi_u32 v1, v0, v1 201; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 202; SI-NEXT: v_mul_hi_u32 v0, s2, v0 203; SI-NEXT: v_readfirstlane_b32 s0, v0 204; SI-NEXT: s_mul_i32 s0, s0, s3 205; SI-NEXT: s_sub_i32 s0, s2, s0 206; SI-NEXT: s_sub_i32 s1, s0, s3 207; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 208; SI-NEXT: s_cmp_ge_u32 s0, s3 209; SI-NEXT: s_cselect_b64 vcc, -1, 0 210; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 211; SI-NEXT: s_cselect_b32 s0, s1, s0 212; SI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 213; SI-NEXT: s_cmp_ge_u32 s0, s3 214; SI-NEXT: s_cselect_b64 vcc, -1, 0 215; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 216; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 217; SI-NEXT: s_endpgm 218; 219; VI-LABEL: s_udiv_i32: 220; VI: ; %bb.0: 221; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 222; VI-NEXT: s_mov_b32 s7, 0xf000 223; VI-NEXT: s_mov_b32 s6, -1 224; VI-NEXT: s_waitcnt lgkmcnt(0) 225; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 226; VI-NEXT: s_sub_i32 s4, 0, s3 227; VI-NEXT: s_mov_b32 s5, s1 228; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 229; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 230; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 231; VI-NEXT: v_mul_lo_u32 v1, s4, v0 232; VI-NEXT: s_mov_b32 s4, s0 233; VI-NEXT: v_mul_hi_u32 v1, v0, v1 234; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 235; VI-NEXT: v_mul_hi_u32 v0, s2, v0 236; VI-NEXT: v_readfirstlane_b32 s0, v0 237; VI-NEXT: s_mul_i32 s0, s0, s3 238; VI-NEXT: s_sub_i32 s0, s2, s0 239; VI-NEXT: s_sub_i32 s1, s0, s3 240; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 241; VI-NEXT: s_cmp_ge_u32 s0, s3 242; VI-NEXT: s_cselect_b64 vcc, -1, 0 243; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 244; VI-NEXT: s_cselect_b32 s0, s1, s0 245; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 246; VI-NEXT: s_cmp_ge_u32 s0, s3 247; VI-NEXT: s_cselect_b64 vcc, -1, 0 
248; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 249; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 250; VI-NEXT: s_endpgm 251; 252; GCN-LABEL: s_udiv_i32: 253; GCN: ; %bb.0: 254; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 255; GCN-NEXT: s_waitcnt lgkmcnt(0) 256; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 257; GCN-NEXT: s_sub_i32 s4, 0, s3 258; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 259; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 260; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 261; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 262; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 263; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 264; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 265; GCN-NEXT: v_readfirstlane_b32 s4, v0 266; GCN-NEXT: s_mul_i32 s4, s4, s3 267; GCN-NEXT: s_sub_i32 s2, s2, s4 268; GCN-NEXT: s_sub_i32 s4, s2, s3 269; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 270; GCN-NEXT: s_cmp_ge_u32 s2, s3 271; GCN-NEXT: s_cselect_b64 vcc, -1, 0 272; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 273; GCN-NEXT: s_cselect_b32 s2, s4, s2 274; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0 275; GCN-NEXT: s_cmp_ge_u32 s2, s3 276; GCN-NEXT: s_cselect_b64 vcc, -1, 0 277; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 278; GCN-NEXT: v_mov_b32_e32 v0, s0 279; GCN-NEXT: v_mov_b32_e32 v1, s1 280; GCN-NEXT: flat_store_dword v[0:1], v2 281; GCN-NEXT: s_endpgm 282; 283; GFX1030-LABEL: s_udiv_i32: 284; GFX1030: ; %bb.0: 285; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 286; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 287; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 288; GFX1030-NEXT: s_sub_i32 s5, 0, s3 289; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 290; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 291; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 292; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 293; GFX1030-NEXT: v_mov_b32_e32 v0, 0 294; GFX1030-NEXT: s_mul_i32 s5, s5, s4 295; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5 296; GFX1030-NEXT: s_add_i32 s4, s4, s5 297; GFX1030-NEXT: s_mul_hi_u32 s4, s2, s4 298; GFX1030-NEXT: s_mul_i32 s5, s4, s3 299; GFX1030-NEXT: s_sub_i32 s2, s2, 
s5 300; GFX1030-NEXT: s_add_i32 s5, s4, 1 301; GFX1030-NEXT: s_sub_i32 s6, s2, s3 302; GFX1030-NEXT: s_cmp_ge_u32 s2, s3 303; GFX1030-NEXT: s_cselect_b32 s4, s5, s4 304; GFX1030-NEXT: s_cselect_b32 s2, s6, s2 305; GFX1030-NEXT: s_add_i32 s5, s4, 1 306; GFX1030-NEXT: s_cmp_ge_u32 s2, s3 307; GFX1030-NEXT: s_cselect_b32 s2, s5, s4 308; GFX1030-NEXT: v_mov_b32_e32 v1, s2 309; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 310; GFX1030-NEXT: s_endpgm 311; 312; EG-LABEL: s_udiv_i32: 313; EG: ; %bb.0: 314; EG-NEXT: ALU 17, @4, KC0[CB0:0-32], KC1[] 315; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 316; EG-NEXT: CF_END 317; EG-NEXT: PAD 318; EG-NEXT: ALU clause starting at 4: 319; EG-NEXT: SUB_INT T0.W, 0.0, KC0[2].W, 320; EG-NEXT: RECIP_UINT * T0.X, KC0[2].W, 321; EG-NEXT: MULLO_INT * T0.Y, PV.W, PS, 322; EG-NEXT: MULHI * T0.Y, T0.X, PS, 323; EG-NEXT: ADD_INT * T0.W, T0.X, PS, 324; EG-NEXT: MULHI * T0.X, KC0[2].Z, PV.W, 325; EG-NEXT: MULLO_INT * T0.Y, PS, KC0[2].W, 326; EG-NEXT: SUB_INT * T0.W, KC0[2].Z, PS, 327; EG-NEXT: SUB_INT T0.Z, PV.W, KC0[2].W, 328; EG-NEXT: SETGE_UINT T1.W, PV.W, KC0[2].W, 329; EG-NEXT: ADD_INT * T2.W, T0.X, 1, 330; EG-NEXT: CNDE_INT T2.W, PV.W, T0.X, PS, 331; EG-NEXT: CNDE_INT * T0.W, PV.W, T0.W, PV.Z, 332; EG-NEXT: SETGE_UINT T0.W, PS, KC0[2].W, 333; EG-NEXT: ADD_INT * T1.W, PV.W, 1, 334; EG-NEXT: CNDE_INT T0.X, PV.W, T2.W, PS, 335; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 336; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 337 %result = udiv i32 %a, %b 338 store i32 %result, ptr addrspace(1) %out 339 ret void 340} 341 342 343; The code generated by udiv is long and complex and may frequently 344; change. 
The goal of this test is to make sure the ISel doesn't fail 345; when it gets a vector (v2i32 or v4i32) udiv 346define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 347; SI-LABEL: udiv_v2i32: 348; SI: ; %bb.0: 349; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 350; SI-NEXT: s_mov_b32 s7, 0xf000 351; SI-NEXT: s_mov_b32 s6, -1 352; SI-NEXT: s_mov_b32 s10, s6 353; SI-NEXT: s_mov_b32 s11, s7 354; SI-NEXT: s_waitcnt lgkmcnt(0) 355; SI-NEXT: s_mov_b32 s8, s2 356; SI-NEXT: s_mov_b32 s9, s3 357; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 358; SI-NEXT: s_mov_b32 s4, s0 359; SI-NEXT: s_mov_b32 s5, s1 360; SI-NEXT: s_waitcnt vmcnt(0) 361; SI-NEXT: v_cvt_f32_u32_e32 v4, v2 362; SI-NEXT: v_cvt_f32_u32_e32 v5, v3 363; SI-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 364; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4 365; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5 366; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 367; SI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 368; SI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 369; SI-NEXT: v_cvt_u32_f32_e32 v4, v4 370; SI-NEXT: v_cvt_u32_f32_e32 v5, v5 371; SI-NEXT: v_mul_lo_u32 v6, v6, v4 372; SI-NEXT: v_mul_lo_u32 v7, v7, v5 373; SI-NEXT: v_mul_hi_u32 v6, v4, v6 374; SI-NEXT: v_mul_hi_u32 v7, v5, v7 375; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 376; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7 377; SI-NEXT: v_mul_hi_u32 v4, v0, v4 378; SI-NEXT: v_mul_hi_u32 v5, v1, v5 379; SI-NEXT: v_mul_lo_u32 v6, v4, v2 380; SI-NEXT: v_mul_lo_u32 v8, v5, v3 381; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v4 382; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 383; SI-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 384; SI-NEXT: v_add_i32_e32 v9, vcc, 1, v5 385; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 386; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 387; SI-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 388; SI-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1] 389; SI-NEXT: v_sub_i32_e32 v7, vcc, v1, v3 390; SI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] 391; SI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] 392; SI-NEXT: v_add_i32_e32 
v6, vcc, 1, v4 393; SI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] 394; SI-NEXT: v_add_i32_e32 v7, vcc, 1, v5 395; SI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 396; SI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 397; SI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 398; SI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc 399; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 400; SI-NEXT: s_endpgm 401; 402; VI-LABEL: udiv_v2i32: 403; VI: ; %bb.0: 404; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 405; VI-NEXT: s_mov_b32 s7, 0xf000 406; VI-NEXT: s_mov_b32 s6, -1 407; VI-NEXT: s_mov_b32 s10, s6 408; VI-NEXT: s_mov_b32 s11, s7 409; VI-NEXT: s_waitcnt lgkmcnt(0) 410; VI-NEXT: s_mov_b32 s8, s2 411; VI-NEXT: s_mov_b32 s9, s3 412; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 413; VI-NEXT: s_mov_b32 s4, s0 414; VI-NEXT: s_mov_b32 s5, s1 415; VI-NEXT: s_waitcnt vmcnt(0) 416; VI-NEXT: v_cvt_f32_u32_e32 v4, v2 417; VI-NEXT: v_cvt_f32_u32_e32 v5, v3 418; VI-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 419; VI-NEXT: v_rcp_iflag_f32_e32 v4, v4 420; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5 421; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3 422; VI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 423; VI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 424; VI-NEXT: v_cvt_u32_f32_e32 v4, v4 425; VI-NEXT: v_cvt_u32_f32_e32 v5, v5 426; VI-NEXT: v_mul_lo_u32 v6, v6, v4 427; VI-NEXT: v_mul_lo_u32 v7, v7, v5 428; VI-NEXT: v_mul_hi_u32 v6, v4, v6 429; VI-NEXT: v_mul_hi_u32 v7, v5, v7 430; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 431; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v7 432; VI-NEXT: v_mul_hi_u32 v4, v0, v4 433; VI-NEXT: v_mul_hi_u32 v5, v1, v5 434; VI-NEXT: v_mul_lo_u32 v6, v4, v2 435; VI-NEXT: v_mul_lo_u32 v8, v5, v3 436; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v4 437; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v6 438; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v8 439; VI-NEXT: v_add_u32_e32 v9, vcc, 1, v5 440; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 441; VI-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 442; VI-NEXT: v_sub_u32_e32 v6, vcc, v0, v2 443; VI-NEXT: 
v_cndmask_b32_e64 v4, v4, v7, s[0:1] 444; VI-NEXT: v_sub_u32_e32 v7, vcc, v1, v3 445; VI-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[2:3] 446; VI-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] 447; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v4 448; VI-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] 449; VI-NEXT: v_add_u32_e32 v7, vcc, 1, v5 450; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 451; VI-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc 452; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 453; VI-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc 454; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 455; VI-NEXT: s_endpgm 456; 457; GCN-LABEL: udiv_v2i32: 458; GCN: ; %bb.0: 459; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 460; GCN-NEXT: s_waitcnt lgkmcnt(0) 461; GCN-NEXT: v_mov_b32_e32 v0, s2 462; GCN-NEXT: v_mov_b32_e32 v1, s3 463; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 464; GCN-NEXT: s_waitcnt vmcnt(0) 465; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 466; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 467; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 468; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 469; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 470; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4 471; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 472; GCN-NEXT: v_cvt_u32_f32_e32 v7, v5 473; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 474; GCN-NEXT: v_mul_lo_u32 v5, v4, v6 475; GCN-NEXT: v_sub_u32_e32 v4, vcc, 0, v3 476; GCN-NEXT: v_mul_lo_u32 v8, v4, v7 477; GCN-NEXT: v_mul_hi_u32 v9, v6, v5 478; GCN-NEXT: v_mov_b32_e32 v4, s0 479; GCN-NEXT: v_mov_b32_e32 v5, s1 480; GCN-NEXT: v_mul_hi_u32 v8, v7, v8 481; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v9 482; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 483; GCN-NEXT: v_add_u32_e32 v7, vcc, v7, v8 484; GCN-NEXT: v_mul_hi_u32 v7, v1, v7 485; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 486; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v6 487; GCN-NEXT: v_mul_lo_u32 v10, v7, v3 488; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 489; GCN-NEXT: v_add_u32_e32 v11, vcc, 1, v7 490; GCN-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 491; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], 
v0, v2 492; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 493; GCN-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 494; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] 495; GCN-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 496; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[2:3] 497; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] 498; GCN-NEXT: v_add_u32_e32 v8, vcc, 1, v6 499; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] 500; GCN-NEXT: v_add_u32_e32 v9, vcc, 1, v7 501; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 502; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc 503; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 504; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc 505; GCN-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 506; GCN-NEXT: s_endpgm 507; 508; GFX1030-LABEL: udiv_v2i32: 509; GFX1030: ; %bb.0: 510; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 511; GFX1030-NEXT: v_mov_b32_e32 v4, 0 512; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 513; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] 514; GFX1030-NEXT: s_waitcnt vmcnt(0) 515; GFX1030-NEXT: v_readfirstlane_b32 s2, v2 516; GFX1030-NEXT: v_readfirstlane_b32 s3, v3 517; GFX1030-NEXT: v_readfirstlane_b32 s6, v0 518; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, s2 519; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, s3 520; GFX1030-NEXT: s_sub_i32 s5, 0, s2 521; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2 522; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3 523; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 524; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v3 525; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2 526; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 527; GFX1030-NEXT: v_readfirstlane_b32 s4, v2 528; GFX1030-NEXT: v_readfirstlane_b32 s8, v0 529; GFX1030-NEXT: s_mul_i32 s5, s5, s4 530; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5 531; GFX1030-NEXT: s_add_i32 s4, s4, s5 532; GFX1030-NEXT: s_mul_hi_u32 s4, s6, s4 533; GFX1030-NEXT: s_mul_i32 s5, s4, s2 534; GFX1030-NEXT: s_sub_i32 s5, s6, s5 535; GFX1030-NEXT: s_add_i32 s6, s4, 1 536; GFX1030-NEXT: s_sub_i32 s7, s5, s2 537; GFX1030-NEXT: s_cmp_ge_u32 s5, s2 538; 
GFX1030-NEXT: s_cselect_b32 s4, s6, s4 539; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 540; GFX1030-NEXT: s_add_i32 s6, s4, 1 541; GFX1030-NEXT: s_cmp_ge_u32 s5, s2 542; GFX1030-NEXT: v_readfirstlane_b32 s5, v1 543; GFX1030-NEXT: s_cselect_b32 s2, s6, s4 544; GFX1030-NEXT: s_sub_i32 s4, 0, s3 545; GFX1030-NEXT: v_mov_b32_e32 v0, s2 546; GFX1030-NEXT: s_mul_i32 s4, s4, s8 547; GFX1030-NEXT: s_mul_hi_u32 s4, s8, s4 548; GFX1030-NEXT: s_add_i32 s8, s8, s4 549; GFX1030-NEXT: s_mul_hi_u32 s4, s5, s8 550; GFX1030-NEXT: s_mul_i32 s6, s4, s3 551; GFX1030-NEXT: s_sub_i32 s5, s5, s6 552; GFX1030-NEXT: s_add_i32 s6, s4, 1 553; GFX1030-NEXT: s_sub_i32 s7, s5, s3 554; GFX1030-NEXT: s_cmp_ge_u32 s5, s3 555; GFX1030-NEXT: s_cselect_b32 s4, s6, s4 556; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 557; GFX1030-NEXT: s_add_i32 s6, s4, 1 558; GFX1030-NEXT: s_cmp_ge_u32 s5, s3 559; GFX1030-NEXT: s_cselect_b32 s3, s6, s4 560; GFX1030-NEXT: v_mov_b32_e32 v1, s3 561; GFX1030-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 562; GFX1030-NEXT: s_endpgm 563; 564; EG-LABEL: udiv_v2i32: 565; EG: ; %bb.0: 566; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 567; EG-NEXT: TEX 0 @6 568; EG-NEXT: ALU 33, @9, KC0[CB0:0-32], KC1[] 569; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1 570; EG-NEXT: CF_END 571; EG-NEXT: PAD 572; EG-NEXT: Fetch clause starting at 6: 573; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 574; EG-NEXT: ALU clause starting at 8: 575; EG-NEXT: MOV * T0.X, KC0[2].Z, 576; EG-NEXT: ALU clause starting at 9: 577; EG-NEXT: SUB_INT T1.W, 0.0, T0.W, 578; EG-NEXT: RECIP_UINT * T1.X, T0.W, 579; EG-NEXT: MULLO_INT * T1.Y, PV.W, PS, 580; EG-NEXT: SUB_INT T1.W, 0.0, T0.Z, 581; EG-NEXT: RECIP_UINT * T1.Z, T0.Z, 582; EG-NEXT: MULLO_INT * T1.W, PV.W, PS, 583; EG-NEXT: MULHI * T1.W, T1.Z, PS, 584; EG-NEXT: ADD_INT T1.W, T1.Z, PS, 585; EG-NEXT: MULHI * T1.Y, T1.X, T1.Y, 586; EG-NEXT: ADD_INT T2.W, T1.X, PS, 587; EG-NEXT: MULHI * T1.X, T0.X, PV.W, 588; EG-NEXT: MULHI * T1.Y, T0.Y, PV.W, 589; EG-NEXT: MULLO_INT * 
T1.Z, PS, T0.W, 590; EG-NEXT: SUB_INT T1.W, T0.Y, PS, 591; EG-NEXT: MULLO_INT * T0.Y, T1.X, T0.Z, 592; EG-NEXT: SUB_INT T0.Y, T0.X, PS, 593; EG-NEXT: ADD_INT T1.Z, T1.Y, 1, 594; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, 595; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, 596; EG-NEXT: CNDE_INT T0.X, PV.W, T1.W, PS, 597; EG-NEXT: CNDE_INT T1.Y, PV.W, T1.Y, PV.Z, 598; EG-NEXT: ADD_INT T1.Z, T1.X, 1, 599; EG-NEXT: SETGE_UINT T1.W, PV.Y, T0.Z, 600; EG-NEXT: SUB_INT * T2.W, PV.Y, T0.Z, 601; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, PS, 602; EG-NEXT: CNDE_INT T1.Z, PV.W, T1.X, PV.Z, 603; EG-NEXT: ADD_INT T1.W, PV.Y, 1, 604; EG-NEXT: SETGE_UINT * T0.W, PV.X, T0.W, 605; EG-NEXT: CNDE_INT T1.Y, PS, T1.Y, PV.W, 606; EG-NEXT: ADD_INT T0.W, PV.Z, 1, 607; EG-NEXT: SETGE_UINT * T1.W, PV.Y, T0.Z, 608; EG-NEXT: CNDE_INT T1.X, PS, T1.Z, PV.W, 609; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 610; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 611 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 612 %a = load <2 x i32>, ptr addrspace(1) %in 613 %b = load <2 x i32>, ptr addrspace(1) %b_ptr 614 %result = udiv <2 x i32> %a, %b 615 store <2 x i32> %result, ptr addrspace(1) %out 616 ret void 617} 618 619define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 620; SI-LABEL: udiv_v4i32: 621; SI: ; %bb.0: 622; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 623; SI-NEXT: s_mov_b32 s11, 0xf000 624; SI-NEXT: s_mov_b32 s10, -1 625; SI-NEXT: s_mov_b32 s6, s10 626; SI-NEXT: s_mov_b32 s7, s11 627; SI-NEXT: s_waitcnt lgkmcnt(0) 628; SI-NEXT: s_mov_b32 s4, s2 629; SI-NEXT: s_mov_b32 s5, s3 630; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 631; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 632; SI-NEXT: s_mov_b32 s8, s0 633; SI-NEXT: s_mov_b32 s9, s1 634; SI-NEXT: s_waitcnt vmcnt(1) 635; SI-NEXT: v_cvt_f32_u32_e32 v8, v0 636; SI-NEXT: v_cvt_f32_u32_e32 v10, v1 637; SI-NEXT: v_cvt_f32_u32_e32 v12, v2 638; SI-NEXT: v_cvt_f32_u32_e32 v14, v3 639; SI-NEXT: 
v_rcp_iflag_f32_e32 v8, v8 640; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 641; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 642; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14 643; SI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 644; SI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 645; SI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 646; SI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 647; SI-NEXT: v_cvt_u32_f32_e32 v8, v8 648; SI-NEXT: v_cvt_u32_f32_e32 v10, v10 649; SI-NEXT: v_cvt_u32_f32_e32 v12, v12 650; SI-NEXT: v_cvt_u32_f32_e32 v14, v14 651; SI-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 652; SI-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 653; SI-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 654; SI-NEXT: v_sub_i32_e32 v15, vcc, 0, v3 655; SI-NEXT: v_mul_lo_u32 v9, v9, v8 656; SI-NEXT: v_mul_lo_u32 v11, v11, v10 657; SI-NEXT: v_mul_lo_u32 v13, v13, v12 658; SI-NEXT: v_mul_lo_u32 v15, v15, v14 659; SI-NEXT: v_mul_hi_u32 v9, v8, v9 660; SI-NEXT: v_mul_hi_u32 v11, v10, v11 661; SI-NEXT: v_mul_hi_u32 v13, v12, v13 662; SI-NEXT: v_mul_hi_u32 v15, v14, v15 663; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 664; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v11 665; SI-NEXT: v_add_i32_e32 v10, vcc, v12, v13 666; SI-NEXT: v_add_i32_e32 v11, vcc, v14, v15 667; SI-NEXT: s_waitcnt vmcnt(0) 668; SI-NEXT: v_mul_hi_u32 v8, v4, v8 669; SI-NEXT: v_mul_hi_u32 v9, v5, v9 670; SI-NEXT: v_mul_hi_u32 v10, v6, v10 671; SI-NEXT: v_mul_hi_u32 v11, v7, v11 672; SI-NEXT: v_mul_lo_u32 v12, v8, v0 673; SI-NEXT: v_mul_lo_u32 v14, v9, v1 674; SI-NEXT: v_mul_lo_u32 v16, v10, v2 675; SI-NEXT: v_mul_lo_u32 v18, v11, v3 676; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 677; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v14 678; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v16 679; SI-NEXT: v_sub_i32_e32 v7, vcc, v7, v18 680; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v8 681; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v9 682; SI-NEXT: v_add_i32_e32 v17, vcc, 1, v10 683; SI-NEXT: v_add_i32_e32 v19, vcc, 1, v11 684; SI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 685; SI-NEXT: v_cmp_ge_u32_e64 s[2:3], v5, v1 686; 
SI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 687; SI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 688; SI-NEXT: v_sub_i32_e32 v12, vcc, v4, v0 689; SI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] 690; SI-NEXT: v_sub_i32_e32 v13, vcc, v5, v1 691; SI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] 692; SI-NEXT: v_sub_i32_e32 v14, vcc, v6, v2 693; SI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] 694; SI-NEXT: v_sub_i32_e32 v15, vcc, v7, v3 695; SI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] 696; SI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] 697; SI-NEXT: v_add_i32_e32 v12, vcc, 1, v8 698; SI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] 699; SI-NEXT: v_add_i32_e32 v13, vcc, 1, v9 700; SI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] 701; SI-NEXT: v_add_i32_e32 v14, vcc, 1, v10 702; SI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] 703; SI-NEXT: v_add_i32_e32 v15, vcc, 1, v11 704; SI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 705; SI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc 706; SI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 707; SI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc 708; SI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 709; SI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc 710; SI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 711; SI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc 712; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 713; SI-NEXT: s_endpgm 714; 715; VI-LABEL: udiv_v4i32: 716; VI: ; %bb.0: 717; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 718; VI-NEXT: s_mov_b32 s11, 0xf000 719; VI-NEXT: s_mov_b32 s10, -1 720; VI-NEXT: s_mov_b32 s6, s10 721; VI-NEXT: s_mov_b32 s7, s11 722; VI-NEXT: s_waitcnt lgkmcnt(0) 723; VI-NEXT: s_mov_b32 s4, s2 724; VI-NEXT: s_mov_b32 s5, s3 725; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 726; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 727; VI-NEXT: s_mov_b32 s8, s0 728; VI-NEXT: s_mov_b32 s9, s1 729; VI-NEXT: s_waitcnt vmcnt(1) 730; VI-NEXT: v_cvt_f32_u32_e32 v8, v0 731; VI-NEXT: v_cvt_f32_u32_e32 v10, v1 732; VI-NEXT: v_cvt_f32_u32_e32 v12, v2 733; VI-NEXT: 
v_cvt_f32_u32_e32 v14, v3 734; VI-NEXT: v_rcp_iflag_f32_e32 v8, v8 735; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 736; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 737; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14 738; VI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 739; VI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 740; VI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 741; VI-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 742; VI-NEXT: v_cvt_u32_f32_e32 v8, v8 743; VI-NEXT: v_cvt_u32_f32_e32 v10, v10 744; VI-NEXT: v_cvt_u32_f32_e32 v12, v12 745; VI-NEXT: v_cvt_u32_f32_e32 v14, v14 746; VI-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 747; VI-NEXT: v_sub_u32_e32 v11, vcc, 0, v1 748; VI-NEXT: v_sub_u32_e32 v13, vcc, 0, v2 749; VI-NEXT: v_sub_u32_e32 v15, vcc, 0, v3 750; VI-NEXT: v_mul_lo_u32 v9, v9, v8 751; VI-NEXT: v_mul_lo_u32 v11, v11, v10 752; VI-NEXT: v_mul_lo_u32 v13, v13, v12 753; VI-NEXT: v_mul_lo_u32 v15, v15, v14 754; VI-NEXT: v_mul_hi_u32 v9, v8, v9 755; VI-NEXT: v_mul_hi_u32 v11, v10, v11 756; VI-NEXT: v_mul_hi_u32 v13, v12, v13 757; VI-NEXT: v_mul_hi_u32 v15, v14, v15 758; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9 759; VI-NEXT: v_add_u32_e32 v9, vcc, v10, v11 760; VI-NEXT: v_add_u32_e32 v10, vcc, v12, v13 761; VI-NEXT: v_add_u32_e32 v11, vcc, v14, v15 762; VI-NEXT: s_waitcnt vmcnt(0) 763; VI-NEXT: v_mul_hi_u32 v8, v4, v8 764; VI-NEXT: v_mul_hi_u32 v9, v5, v9 765; VI-NEXT: v_mul_hi_u32 v10, v6, v10 766; VI-NEXT: v_mul_hi_u32 v11, v7, v11 767; VI-NEXT: v_mul_lo_u32 v12, v8, v0 768; VI-NEXT: v_mul_lo_u32 v14, v9, v1 769; VI-NEXT: v_mul_lo_u32 v16, v10, v2 770; VI-NEXT: v_mul_lo_u32 v18, v11, v3 771; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 772; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v14 773; VI-NEXT: v_sub_u32_e32 v6, vcc, v6, v16 774; VI-NEXT: v_sub_u32_e32 v7, vcc, v7, v18 775; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v8 776; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v9 777; VI-NEXT: v_add_u32_e32 v17, vcc, 1, v10 778; VI-NEXT: v_add_u32_e32 v19, vcc, 1, v11 779; VI-NEXT: v_cmp_ge_u32_e64 s[0:1], v4, v0 780; VI-NEXT: 
v_cmp_ge_u32_e64 s[2:3], v5, v1 781; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 782; VI-NEXT: v_cmp_ge_u32_e64 s[6:7], v7, v3 783; VI-NEXT: v_sub_u32_e32 v12, vcc, v4, v0 784; VI-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[0:1] 785; VI-NEXT: v_sub_u32_e32 v13, vcc, v5, v1 786; VI-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[2:3] 787; VI-NEXT: v_sub_u32_e32 v14, vcc, v6, v2 788; VI-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[4:5] 789; VI-NEXT: v_sub_u32_e32 v15, vcc, v7, v3 790; VI-NEXT: v_cndmask_b32_e64 v11, v11, v19, s[6:7] 791; VI-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] 792; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v8 793; VI-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[2:3] 794; VI-NEXT: v_add_u32_e32 v13, vcc, 1, v9 795; VI-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] 796; VI-NEXT: v_add_u32_e32 v14, vcc, 1, v10 797; VI-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[6:7] 798; VI-NEXT: v_add_u32_e32 v15, vcc, 1, v11 799; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 800; VI-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc 801; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 802; VI-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc 803; VI-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 804; VI-NEXT: v_cndmask_b32_e32 v2, v10, v14, vcc 805; VI-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 806; VI-NEXT: v_cndmask_b32_e32 v3, v11, v15, vcc 807; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 808; VI-NEXT: s_endpgm 809; 810; GCN-LABEL: udiv_v4i32: 811; GCN: ; %bb.0: 812; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 813; GCN-NEXT: s_waitcnt lgkmcnt(0) 814; GCN-NEXT: s_add_u32 s4, s2, 16 815; GCN-NEXT: s_addc_u32 s5, s3, 0 816; GCN-NEXT: v_mov_b32_e32 v0, s4 817; GCN-NEXT: v_mov_b32_e32 v1, s5 818; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 819; GCN-NEXT: v_mov_b32_e32 v5, s3 820; GCN-NEXT: v_mov_b32_e32 v4, s2 821; GCN-NEXT: flat_load_dwordx4 v[6:9], v[4:5] 822; GCN-NEXT: v_mov_b32_e32 v4, s0 823; GCN-NEXT: v_mov_b32_e32 v5, s1 824; GCN-NEXT: s_waitcnt vmcnt(1) 825; GCN-NEXT: v_cvt_f32_u32_e32 v10, v0 826; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 827; 
GCN-NEXT: v_cvt_f32_u32_e32 v14, v2 828; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3 829; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 830; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 831; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 832; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 833; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 834; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 835; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 836; GCN-NEXT: v_mul_f32_e32 v16, 0x4f7ffffe, v16 837; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 838; GCN-NEXT: v_cvt_u32_f32_e32 v12, v12 839; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 840; GCN-NEXT: v_cvt_u32_f32_e32 v16, v16 841; GCN-NEXT: v_sub_u32_e32 v11, vcc, 0, v0 842; GCN-NEXT: v_sub_u32_e32 v13, vcc, 0, v1 843; GCN-NEXT: v_sub_u32_e32 v15, vcc, 0, v2 844; GCN-NEXT: v_sub_u32_e32 v17, vcc, 0, v3 845; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 846; GCN-NEXT: v_mul_lo_u32 v13, v13, v12 847; GCN-NEXT: v_mul_lo_u32 v15, v15, v14 848; GCN-NEXT: v_mul_lo_u32 v17, v17, v16 849; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 850; GCN-NEXT: v_mul_hi_u32 v13, v12, v13 851; GCN-NEXT: v_mul_hi_u32 v15, v14, v15 852; GCN-NEXT: v_mul_hi_u32 v17, v16, v17 853; GCN-NEXT: v_add_u32_e32 v10, vcc, v10, v11 854; GCN-NEXT: v_add_u32_e32 v11, vcc, v12, v13 855; GCN-NEXT: v_add_u32_e32 v12, vcc, v14, v15 856; GCN-NEXT: v_add_u32_e32 v13, vcc, v16, v17 857; GCN-NEXT: s_waitcnt vmcnt(0) 858; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 859; GCN-NEXT: v_mul_hi_u32 v11, v7, v11 860; GCN-NEXT: v_mul_hi_u32 v12, v8, v12 861; GCN-NEXT: v_mul_hi_u32 v13, v9, v13 862; GCN-NEXT: v_mul_lo_u32 v14, v10, v0 863; GCN-NEXT: v_mul_lo_u32 v16, v11, v1 864; GCN-NEXT: v_mul_lo_u32 v18, v12, v2 865; GCN-NEXT: v_mul_lo_u32 v19, v13, v3 866; GCN-NEXT: v_sub_u32_e32 v6, vcc, v6, v14 867; GCN-NEXT: v_sub_u32_e32 v7, vcc, v7, v16 868; GCN-NEXT: v_sub_u32_e32 v8, vcc, v8, v18 869; GCN-NEXT: v_sub_u32_e32 v9, vcc, v9, v19 870; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v10 871; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v11 872; GCN-NEXT: v_add_u32_e32 
v14, vcc, 1, v12 873; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v13 874; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v6, v0 875; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v7, v1 876; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 877; GCN-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v3 878; GCN-NEXT: v_sub_u32_e32 v18, vcc, v6, v0 879; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v15, s[0:1] 880; GCN-NEXT: v_sub_u32_e32 v15, vcc, v7, v1 881; GCN-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[2:3] 882; GCN-NEXT: v_sub_u32_e32 v17, vcc, v8, v2 883; GCN-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] 884; GCN-NEXT: v_sub_u32_e32 v14, vcc, v9, v3 885; GCN-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[6:7] 886; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v18, s[0:1] 887; GCN-NEXT: v_add_u32_e32 v16, vcc, 1, v10 888; GCN-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[2:3] 889; GCN-NEXT: v_add_u32_e32 v15, vcc, 1, v11 890; GCN-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[4:5] 891; GCN-NEXT: v_add_u32_e32 v17, vcc, 1, v12 892; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] 893; GCN-NEXT: v_add_u32_e32 v14, vcc, 1, v13 894; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0 895; GCN-NEXT: v_cndmask_b32_e32 v0, v10, v16, vcc 896; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v1 897; GCN-NEXT: v_cndmask_b32_e32 v1, v11, v15, vcc 898; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 899; GCN-NEXT: v_cndmask_b32_e32 v2, v12, v17, vcc 900; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 901; GCN-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc 902; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 903; GCN-NEXT: s_endpgm 904; 905; GFX1030-LABEL: udiv_v4i32: 906; GFX1030: ; %bb.0: 907; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 908; GFX1030-NEXT: v_mov_b32_e32 v8, 0 909; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 910; GFX1030-NEXT: s_clause 0x1 911; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 912; GFX1030-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] 913; GFX1030-NEXT: s_waitcnt vmcnt(1) 914; GFX1030-NEXT: v_readfirstlane_b32 s2, v0 915; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 916; 
GFX1030-NEXT: s_waitcnt vmcnt(0) 917; GFX1030-NEXT: v_readfirstlane_b32 s7, v4 918; GFX1030-NEXT: v_readfirstlane_b32 s5, v2 919; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s2 920; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s3 921; GFX1030-NEXT: s_sub_i32 s6, 0, s2 922; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 923; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 924; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 925; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 926; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 927; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 928; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 929; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s5 930; GFX1030-NEXT: v_readfirstlane_b32 s9, v1 931; GFX1030-NEXT: s_mul_i32 s6, s6, s4 932; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 933; GFX1030-NEXT: s_mul_hi_u32 s6, s4, s6 934; GFX1030-NEXT: s_add_i32 s4, s4, s6 935; GFX1030-NEXT: s_mul_hi_u32 s4, s7, s4 936; GFX1030-NEXT: s_mul_i32 s6, s4, s2 937; GFX1030-NEXT: s_sub_i32 s6, s7, s6 938; GFX1030-NEXT: s_add_i32 s7, s4, 1 939; GFX1030-NEXT: s_sub_i32 s8, s6, s2 940; GFX1030-NEXT: s_cmp_ge_u32 s6, s2 941; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 942; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 943; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 944; GFX1030-NEXT: s_add_i32 s7, s4, 1 945; GFX1030-NEXT: s_cmp_ge_u32 s6, s2 946; GFX1030-NEXT: v_readfirstlane_b32 s2, v3 947; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 948; GFX1030-NEXT: s_sub_i32 s6, 0, s3 949; GFX1030-NEXT: v_readfirstlane_b32 s7, v5 950; GFX1030-NEXT: s_mul_i32 s6, s6, s9 951; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 952; GFX1030-NEXT: s_mul_hi_u32 s6, s9, s6 953; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 954; GFX1030-NEXT: s_add_i32 s9, s9, s6 955; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s9 956; GFX1030-NEXT: v_readfirstlane_b32 s10, v0 957; GFX1030-NEXT: s_mul_i32 s8, s6, s3 958; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 959; GFX1030-NEXT: s_sub_i32 s7, s7, s8 960; GFX1030-NEXT: s_add_i32 s8, s6, 1 961; GFX1030-NEXT: s_sub_i32 s9, s7, s3 962; GFX1030-NEXT: 
s_cmp_ge_u32 s7, s3 963; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 964; GFX1030-NEXT: s_cselect_b32 s7, s9, s7 965; GFX1030-NEXT: s_add_i32 s8, s6, 1 966; GFX1030-NEXT: s_cmp_ge_u32 s7, s3 967; GFX1030-NEXT: v_readfirstlane_b32 s7, v6 968; GFX1030-NEXT: s_cselect_b32 s3, s8, s6 969; GFX1030-NEXT: s_sub_i32 s6, 0, s5 970; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 971; GFX1030-NEXT: s_mul_i32 s6, s6, s10 972; GFX1030-NEXT: v_mov_b32_e32 v1, s3 973; GFX1030-NEXT: s_mul_hi_u32 s6, s10, s6 974; GFX1030-NEXT: s_add_i32 s10, s10, s6 975; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 976; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s10 977; GFX1030-NEXT: s_mul_i32 s8, s6, s5 978; GFX1030-NEXT: s_sub_i32 s7, s7, s8 979; GFX1030-NEXT: s_add_i32 s8, s6, 1 980; GFX1030-NEXT: s_sub_i32 s9, s7, s5 981; GFX1030-NEXT: s_cmp_ge_u32 s7, s5 982; GFX1030-NEXT: v_readfirstlane_b32 s10, v0 983; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 984; GFX1030-NEXT: s_cselect_b32 s7, s9, s7 985; GFX1030-NEXT: s_add_i32 s8, s6, 1 986; GFX1030-NEXT: s_cmp_ge_u32 s7, s5 987; GFX1030-NEXT: v_readfirstlane_b32 s7, v7 988; GFX1030-NEXT: s_cselect_b32 s5, s8, s6 989; GFX1030-NEXT: s_sub_i32 s6, 0, s2 990; GFX1030-NEXT: v_mov_b32_e32 v0, s4 991; GFX1030-NEXT: s_mul_i32 s6, s6, s10 992; GFX1030-NEXT: v_mov_b32_e32 v2, s5 993; GFX1030-NEXT: s_mul_hi_u32 s6, s10, s6 994; GFX1030-NEXT: s_add_i32 s10, s10, s6 995; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s10 996; GFX1030-NEXT: s_mul_i32 s8, s6, s2 997; GFX1030-NEXT: s_sub_i32 s7, s7, s8 998; GFX1030-NEXT: s_add_i32 s8, s6, 1 999; GFX1030-NEXT: s_sub_i32 s9, s7, s2 1000; GFX1030-NEXT: s_cmp_ge_u32 s7, s2 1001; GFX1030-NEXT: s_cselect_b32 s6, s8, s6 1002; GFX1030-NEXT: s_cselect_b32 s7, s9, s7 1003; GFX1030-NEXT: s_add_i32 s8, s6, 1 1004; GFX1030-NEXT: s_cmp_ge_u32 s7, s2 1005; GFX1030-NEXT: s_cselect_b32 s2, s8, s6 1006; GFX1030-NEXT: v_mov_b32_e32 v3, s2 1007; GFX1030-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] 1008; GFX1030-NEXT: s_endpgm 1009; 1010; EG-LABEL: udiv_v4i32: 1011; 
EG: ; %bb.0: 1012; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 1013; EG-NEXT: TEX 1 @6 1014; EG-NEXT: ALU 65, @11, KC0[CB0:0-32], KC1[] 1015; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1 1016; EG-NEXT: CF_END 1017; EG-NEXT: PAD 1018; EG-NEXT: Fetch clause starting at 6: 1019; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 1020; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 1021; EG-NEXT: ALU clause starting at 10: 1022; EG-NEXT: MOV * T0.X, KC0[2].Z, 1023; EG-NEXT: ALU clause starting at 11: 1024; EG-NEXT: SUB_INT T2.W, 0.0, T1.W, 1025; EG-NEXT: RECIP_UINT * T2.X, T1.W, 1026; EG-NEXT: MULLO_INT * T2.Y, PV.W, PS, 1027; EG-NEXT: MULHI * T2.Y, T2.X, PS, 1028; EG-NEXT: ADD_INT * T2.W, T2.X, PS, 1029; EG-NEXT: MULHI * T2.X, T0.W, PV.W, 1030; EG-NEXT: MULLO_INT * T2.Y, PS, T1.W, 1031; EG-NEXT: SUB_INT T2.W, 0.0, T1.X, 1032; EG-NEXT: RECIP_UINT * T2.Z, T1.X, 1033; EG-NEXT: MULLO_INT * T2.W, PV.W, PS, 1034; EG-NEXT: SUB_INT T3.W, 0.0, T1.Y, 1035; EG-NEXT: RECIP_UINT * T3.X, T1.Y, 1036; EG-NEXT: MULLO_INT * T3.Y, PV.W, PS, 1037; EG-NEXT: SUB_INT T3.W, 0.0, T1.Z, 1038; EG-NEXT: RECIP_UINT * T3.Z, T1.Z, 1039; EG-NEXT: MULLO_INT * T3.W, PV.W, PS, 1040; EG-NEXT: MULHI * T3.W, T3.Z, PS, 1041; EG-NEXT: ADD_INT T3.W, T3.Z, PS, 1042; EG-NEXT: MULHI * T3.Y, T3.X, T3.Y, 1043; EG-NEXT: ADD_INT T4.W, T3.X, PS, 1044; EG-NEXT: MULHI * T3.X, T0.Z, PV.W, 1045; EG-NEXT: MULHI * T3.Y, T0.Y, PV.W, 1046; EG-NEXT: MULLO_INT * T3.Z, PS, T1.Y, 1047; EG-NEXT: SUB_INT T3.W, T0.Y, PS, 1048; EG-NEXT: MULLO_INT * T0.Y, T3.X, T1.Z, 1049; EG-NEXT: SUB_INT T4.X, T0.Z, PS, 1050; EG-NEXT: ADD_INT T0.Y, T3.Y, 1, 1051; EG-NEXT: SETGE_UINT T0.Z, PV.W, T1.Y, 1052; EG-NEXT: SUB_INT T4.W, PV.W, T1.Y, 1053; EG-NEXT: MULHI * T2.W, T2.Z, T2.W, 1054; EG-NEXT: CNDE_INT T5.X, PV.Z, T3.W, PV.W, 1055; EG-NEXT: CNDE_INT T0.Y, PV.Z, T3.Y, PV.Y, BS:VEC_021/SCL_122 1056; EG-NEXT: SETGE_UINT T0.Z, PV.X, T1.Z, 1057; EG-NEXT: ADD_INT T2.W, T2.Z, PS, 1058; EG-NEXT: SUB_INT * T0.W, T0.W, T2.Y, 1059; EG-NEXT: ADD_INT T6.X, T3.X, 
1, 1060; EG-NEXT: ADD_INT T2.Y, T2.X, 1, BS:VEC_120/SCL_212 1061; EG-NEXT: SETGE_UINT T2.Z, PS, T1.W, 1062; EG-NEXT: SUB_INT T3.W, PS, T1.W, 1063; EG-NEXT: MULHI * T2.W, T0.X, PV.W, 1064; EG-NEXT: SUB_INT T7.X, T4.X, T1.Z, 1065; EG-NEXT: CNDE_INT T3.Y, PV.Z, T0.W, PV.W, 1066; EG-NEXT: CNDE_INT T2.Z, PV.Z, T2.X, PV.Y, 1067; EG-NEXT: CNDE_INT * T0.W, T0.Z, T3.X, PV.X, BS:VEC_021/SCL_122 1068; EG-NEXT: MULLO_INT * T2.X, T2.W, T1.X, 1069; EG-NEXT: ADD_INT T3.X, T0.W, 1, 1070; EG-NEXT: ADD_INT T2.Y, T2.Z, 1, 1071; EG-NEXT: SETGE_UINT T3.Z, T3.Y, T1.W, 1072; EG-NEXT: SUB_INT T1.W, T0.X, PS, BS:VEC_201 1073; EG-NEXT: CNDE_INT * T3.W, T0.Z, T4.X, T7.X, 1074; EG-NEXT: SETGE_UINT T0.X, PS, T1.Z, BS:VEC_021/SCL_122 1075; EG-NEXT: ADD_INT T3.Y, T2.W, 1, 1076; EG-NEXT: SETGE_UINT T0.Z, PV.W, T1.X, 1077; EG-NEXT: SUB_INT T3.W, PV.W, T1.X, 1078; EG-NEXT: CNDE_INT * T4.W, PV.Z, T2.Z, PV.Y, 1079; EG-NEXT: CNDE_INT T2.X, PV.Z, T1.W, PV.W, 1080; EG-NEXT: CNDE_INT T2.Y, PV.Z, T2.W, PV.Y, BS:VEC_021/SCL_122 1081; EG-NEXT: CNDE_INT T4.Z, PV.X, T0.W, T3.X, BS:VEC_201 1082; EG-NEXT: ADD_INT T0.W, T0.Y, 1, 1083; EG-NEXT: SETGE_UINT * T1.W, T5.X, T1.Y, 1084; EG-NEXT: CNDE_INT T4.Y, PS, T0.Y, PV.W, 1085; EG-NEXT: ADD_INT T0.W, PV.Y, 1, 1086; EG-NEXT: SETGE_UINT * T1.W, PV.X, T1.X, 1087; EG-NEXT: CNDE_INT T4.X, PS, T2.Y, PV.W, 1088; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1089; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1090 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 1091 %a = load <4 x i32>, ptr addrspace(1) %in 1092 %b = load <4 x i32>, ptr addrspace(1) %b_ptr 1093 %result = udiv <4 x i32> %a, %b 1094 store <4 x i32> %result, ptr addrspace(1) %out 1095 ret void 1096}

; Unsigned 32-bit divide of a loaded value by the power-of-two constant 16.
; Every checked target is expected to fold the division into a single
; logical shift right by 4.
define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32_div_pow2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: udiv_i32_div_pow2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 4, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_pow2:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 4, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
; EG-LABEL: udiv_i32_div_pow2:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR T0.X, T0.X, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45)
  %a = load i32, ptr addrspace(1) %in
  %result = udiv i32 %a, 16
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Unsigned 32-bit divide by the even, non-power-of-two constant 34259182.
; The checks expect a multiply-high by the constant 0xfabbd9c1 followed by a
; logical shift right by 25 instead of a real division sequence.
define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32_div_k_even:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s2, 0xfabbd9c1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_hi_u32 v0, v0, s2
; SI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: udiv_i32_div_k_even:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_hi_u32 v0, v0, s2
; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_mov_b32 s2, 0xfabbd9c1
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 25, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_even:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: v_mul_hi_u32 v1, 0xfabbd9c1, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 25, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
; EG-LABEL: udiv_i32_div_k_even:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
; EG-NEXT: -88352319(-4.876880e+35), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 25(3.503246e-44), 2(2.802597e-45)
  %a = load i32, ptr addrspace(1) %in
  %result = udiv i32 %a, 34259182
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Unsigned 32-bit divide by the odd constant 34259183.
; The checks expect a multiply-high by the constant 0x7d5deca3 followed by a
; logical shift right by 24.
define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: udiv_i32_div_k_odd:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s2, 0x7d5deca3
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mul_hi_u32 v0, v0, s2
; SI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_hi_u32 v0, v0, s2
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_mov_b32 s2, 0x7d5deca3
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v2, v0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GFX1030-LABEL: udiv_i32_div_k_odd:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: v_mul_hi_u32 v1, 0x7d5deca3, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
; EG-LABEL: udiv_i32_div_k_odd:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MULHI * T0.X, T0.X, literal.x,
; EG-NEXT: 2103307427(1.843675e+37), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PS, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 24(3.363116e-44), 2(2.802597e-45)
  %a = load i32, ptr addrspace(1) %in
  %result = udiv i32 %a, 34259183
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; 8-bit unsigned divide: the numerator is the byte at %in, the denominator is
; the byte right after it, and the quotient is zero-extended to i32 and stored.
; The checks expect the float-reciprocal based expansion (convert to f32,
; reciprocal, multiply, truncate, one correction step) followed by an AND
; with 0xff.
define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_udiv_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT: v_mul_f32_e32 v2, v1, v2
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_cvt_u32_f32_e32 v3, v2
; SI-NEXT: v_mad_f32 v1, -v2, v0, v1
; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_udiv_i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT: v_mul_f32_e32 v2, v1, v2
; VI-NEXT: v_trunc_f32_e32 v2, v2
; VI-NEXT: v_cvt_u32_f32_e32 v3, v2
; VI-NEXT: v_mad_f32 v1, -v2, v0, v1
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_ubyte1_e32 v3, v2
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3
; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
; GCN-NEXT: v_mul_f32_e32 v4, v2, v4
; GCN-NEXT: v_trunc_f32_e32 v4, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT: v_mad_f32 v2, -v4, v3, v2
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GFX1030-LABEL: v_udiv_i8:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: v_cvt_f32_ubyte1_e32 v2, v1
; GFX1030-NEXT: v_cvt_f32_ubyte0_e32 v1, v1
; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2
; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX1030-NEXT: v_trunc_f32_e32 v3, v3
; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
; EG-LABEL: v_udiv_i8:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T1.X, T0.X, 1, #1
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X,
; EG-NEXT: RECIP_IEEE * T0.Z, PS,
; EG-NEXT: UINT_TO_FLT * T0.X, T0.X,
; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: AND_INT T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
  %den_ptr = getelementptr i8, ptr addrspace(1) %in, i8 1
  %num = load i8, ptr addrspace(1) %in
  %den = load i8, ptr addrspace(1) %den_ptr
  %result = udiv i8 %num, %den
  %result.ext = zext i8 %result to i32
  store i32 %result.ext, ptr addrspace(1) %out
  ret void
}

; 16-bit unsigned divide: the numerator is the i16 at %in, the denominator is
; the i16 right after it, and the quotient is zero-extended to i32 and stored.
; The checks expect the same float-reciprocal based expansion as the 8-bit
; case, finished with an AND with 0xffff.
define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_udiv_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_u32_e32 v0, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_u32_e32 v1, v1
; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0
; SI-NEXT: v_mul_f32_e32 v2, v1, v2
; SI-NEXT: v_trunc_f32_e32 v2, v2
; SI-NEXT: v_cvt_u32_f32_e32 v3, v2
; SI-NEXT: v_mad_f32 v1, -v2, v0, v1
; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_udiv_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
; VI-NEXT: v_mul_f32_e32 v2, v1, v2
; VI-NEXT: v_trunc_f32_e32 v2, v2
; VI-NEXT: v_cvt_u32_f32_e32 v3, v2
; VI-NEXT: v_mad_f32 v1, -v2, v0, v1
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2
; GCN-NEXT: v_mul_f32_e32 v4, v3, v4
; GCN-NEXT: v_trunc_f32_e32 v4, v4
; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4
; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
;
; GFX1030-LABEL: v_udiv_i16:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v0, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX1030-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2
; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX1030-NEXT: v_trunc_f32_e32 v3, v3
; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v1|, v2
; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX1030-NEXT: global_store_dword v0, v1, s[0:1]
; GFX1030-NEXT: s_endpgm
;
; EG-LABEL: v_udiv_i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: UINT_TO_FLT * T0.Y, T1.X,
; EG-NEXT: RECIP_IEEE * T0.Z, PS,
; EG-NEXT: UINT_TO_FLT * T0.X, T0.X,
; EG-NEXT: MUL_IEEE * T0.W, PS, T0.Z,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, PS, PV.W,
; EG-NEXT: AND_INT T0.X, PV.W, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
  %den_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
  %num = load i16, ptr addrspace(1) %in
  %den = load i16, ptr addrspace(1) %den_ptr
  %result = udiv i16 %num, %den
  %result.ext = zext i16 %result to i32
  store i32 %result.ext, ptr addrspace(1) %out
  ret void
}

1651define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1652; SI-LABEL: v_udiv_i23: 1653; SI: ; %bb.0: 1654; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1655; SI-NEXT: s_mov_b32 s7, 0xf000 1656; SI-NEXT: s_mov_b32 s6, -1 1657; SI-NEXT: s_mov_b32 s10, s6 1658; SI-NEXT: s_mov_b32 s11, s7 1659; SI-NEXT: s_waitcnt lgkmcnt(0) 1660; SI-NEXT: s_mov_b32 s8, s2 1661; SI-NEXT: s_mov_b32 s9, s3 1662; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1663; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1664; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1665; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1666; SI-NEXT: s_mov_b32 s4, s0 1667; SI-NEXT: s_mov_b32 s5, s1 1668; SI-NEXT: s_waitcnt vmcnt(3) 1669; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1670; SI-NEXT: s_waitcnt vmcnt(2) 1671; SI-NEXT: v_or_b32_e32 v0, v1, v0 1672; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 1673; SI-NEXT: s_waitcnt vmcnt(1) 1674; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 1675; SI-NEXT: s_waitcnt vmcnt(0) 1676; SI-NEXT: v_or_b32_e32 v1, v3, v1 1677; SI-NEXT: v_cvt_f32_u32_e32 v1, v1
1678; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1679; SI-NEXT: v_mul_f32_e32 v2, v1, v2 1680; SI-NEXT: v_trunc_f32_e32 v2, v2 1681; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 1682; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 1683; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1684; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1685; SI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 1686; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1687; SI-NEXT: s_endpgm 1688; 1689; VI-LABEL: v_udiv_i23: 1690; VI: ; %bb.0: 1691; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1692; VI-NEXT: s_mov_b32 s7, 0xf000 1693; VI-NEXT: s_mov_b32 s6, -1 1694; VI-NEXT: s_mov_b32 s10, s6 1695; VI-NEXT: s_mov_b32 s11, s7 1696; VI-NEXT: s_waitcnt lgkmcnt(0) 1697; VI-NEXT: s_mov_b32 s8, s2 1698; VI-NEXT: s_mov_b32 s9, s3 1699; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1700; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1701; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1702; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1703; VI-NEXT: s_mov_b32 s4, s0 1704; VI-NEXT: s_mov_b32 s5, s1 1705; VI-NEXT: s_waitcnt vmcnt(3) 1706; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1707; VI-NEXT: s_waitcnt vmcnt(2) 1708; VI-NEXT: v_or_b32_e32 v0, v1, v0 1709; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 1710; VI-NEXT: s_waitcnt vmcnt(1) 1711; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 1712; VI-NEXT: s_waitcnt vmcnt(0) 1713; VI-NEXT: v_or_b32_e32 v1, v3, v1 1714; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 1715; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1716; VI-NEXT: v_mul_f32_e32 v2, v1, v2 1717; VI-NEXT: v_trunc_f32_e32 v2, v2 1718; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 1719; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 1720; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1721; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1722; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 1723; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1724; VI-NEXT: s_endpgm 1725; 1726; GCN-LABEL: v_udiv_i23: 1727; GCN: ; %bb.0: 1728; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1729; GCN-NEXT: s_waitcnt 
lgkmcnt(0) 1730; GCN-NEXT: s_add_u32 s4, s2, 4 1731; GCN-NEXT: s_addc_u32 s5, s3, 0 1732; GCN-NEXT: s_add_u32 s6, s2, 2 1733; GCN-NEXT: s_addc_u32 s7, s3, 0 1734; GCN-NEXT: v_mov_b32_e32 v0, s6 1735; GCN-NEXT: v_mov_b32_e32 v1, s7 1736; GCN-NEXT: s_add_u32 s6, s2, 6 1737; GCN-NEXT: s_addc_u32 s7, s3, 0 1738; GCN-NEXT: v_mov_b32_e32 v2, s6 1739; GCN-NEXT: v_mov_b32_e32 v3, s7 1740; GCN-NEXT: v_mov_b32_e32 v4, s4 1741; GCN-NEXT: v_mov_b32_e32 v5, s5 1742; GCN-NEXT: flat_load_ubyte v6, v[2:3] 1743; GCN-NEXT: flat_load_ushort v4, v[4:5] 1744; GCN-NEXT: v_mov_b32_e32 v2, s2 1745; GCN-NEXT: v_mov_b32_e32 v3, s3 1746; GCN-NEXT: flat_load_ubyte v0, v[0:1] 1747; GCN-NEXT: flat_load_ushort v1, v[2:3] 1748; GCN-NEXT: s_waitcnt vmcnt(3) 1749; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 1750; GCN-NEXT: s_waitcnt vmcnt(2) 1751; GCN-NEXT: v_or_b32_e32 v2, v4, v2 1752; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 1753; GCN-NEXT: s_waitcnt vmcnt(1) 1754; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1755; GCN-NEXT: s_waitcnt vmcnt(0) 1756; GCN-NEXT: v_or_b32_e32 v0, v1, v0 1757; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 1758; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 1759; GCN-NEXT: v_mov_b32_e32 v0, s0 1760; GCN-NEXT: v_mov_b32_e32 v1, s1 1761; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 1762; GCN-NEXT: v_trunc_f32_e32 v4, v4 1763; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1764; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 1765; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 1766; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 1767; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 1768; GCN-NEXT: flat_store_dword v[0:1], v2 1769; GCN-NEXT: s_endpgm 1770; 1771; GFX1030-LABEL: v_udiv_i23: 1772; GFX1030: ; %bb.0: 1773; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1774; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1775; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1776; GFX1030-NEXT: s_clause 0x3 1777; GFX1030-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 1778; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 1779; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] 
offset:2 1780; GFX1030-NEXT: global_load_ushort v4, v0, s[2:3] 1781; GFX1030-NEXT: s_waitcnt vmcnt(3) 1782; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1783; GFX1030-NEXT: s_waitcnt vmcnt(2) 1784; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1 1785; GFX1030-NEXT: s_waitcnt vmcnt(1) 1786; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1787; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 1788; GFX1030-NEXT: s_waitcnt vmcnt(0) 1789; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 1790; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 1791; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 1792; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 1793; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 1794; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2 1795; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 1796; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1 1797; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo 1798; GFX1030-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 1799; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1800; GFX1030-NEXT: s_endpgm 1801; 1802; EG-LABEL: v_udiv_i23: 1803; EG: ; %bb.0: 1804; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1805; EG-NEXT: TEX 3 @6 1806; EG-NEXT: ALU 20, @15, KC0[CB0:0-32], KC1[] 1807; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1808; EG-NEXT: CF_END 1809; EG-NEXT: PAD 1810; EG-NEXT: Fetch clause starting at 6: 1811; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1 1812; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 1813; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1 1814; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 1815; EG-NEXT: ALU clause starting at 14: 1816; EG-NEXT: MOV * T0.X, KC0[2].Z, 1817; EG-NEXT: ALU clause starting at 15: 1818; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1819; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1820; EG-NEXT: OR_INT T0.W, T0.X, PV.W, 1821; EG-NEXT: LSHL * T1.W, T3.X, literal.x, 1822; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1823; EG-NEXT: UINT_TO_FLT * T0.X, PV.W, 1824; EG-NEXT: OR_INT T0.W, T2.X, T1.W, 1825; EG-NEXT: RECIP_IEEE * T0.Y, PS, 1826; EG-NEXT: UINT_TO_FLT * T0.Z, PV.W, 1827; EG-NEXT: 
MUL_IEEE * T0.W, PS, T0.Y, 1828; EG-NEXT: TRUNC * T0.W, PV.W, 1829; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z, 1830; EG-NEXT: TRUNC * T0.W, PV.W, 1831; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.X|, 1832; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x, 1833; EG-NEXT: FLT_TO_UINT * T0.X, T0.W, 1834; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00) 1835; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 1836; EG-NEXT: AND_INT T0.X, PV.W, literal.x, 1837; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1838; EG-NEXT: 8388607(1.175494e-38), 2(2.802597e-45) 1839 %den_ptr = getelementptr i23, ptr addrspace(1) %in, i23 1 1840 %num = load i23, ptr addrspace(1) %in 1841 %den = load i23, ptr addrspace(1) %den_ptr 1842 %result = udiv i23 %num, %den 1843 %result.ext = zext i23 %result to i32 1844 store i32 %result.ext, ptr addrspace(1) %out 1845 ret void 1846} 1847 1848define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1849; SI-LABEL: v_udiv_i24: 1850; SI: ; %bb.0: 1851; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1852; SI-NEXT: s_mov_b32 s7, 0xf000 1853; SI-NEXT: s_mov_b32 s6, -1 1854; SI-NEXT: s_mov_b32 s10, s6 1855; SI-NEXT: s_mov_b32 s11, s7 1856; SI-NEXT: s_waitcnt lgkmcnt(0) 1857; SI-NEXT: s_mov_b32 s8, s2 1858; SI-NEXT: s_mov_b32 s9, s3 1859; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1860; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1861; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1862; SI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1863; SI-NEXT: s_mov_b32 s4, s0 1864; SI-NEXT: s_mov_b32 s5, s1 1865; SI-NEXT: s_waitcnt vmcnt(3) 1866; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1867; SI-NEXT: s_waitcnt vmcnt(2) 1868; SI-NEXT: v_or_b32_e32 v0, v1, v0 1869; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 1870; SI-NEXT: s_waitcnt vmcnt(1) 1871; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 1872; SI-NEXT: s_waitcnt vmcnt(0) 1873; SI-NEXT: v_or_b32_e32 v1, v3, v1 1874; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 1875; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1876; SI-NEXT: 
v_mul_f32_e32 v2, v1, v2 1877; SI-NEXT: v_trunc_f32_e32 v2, v2 1878; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 1879; SI-NEXT: v_mad_f32 v1, -v2, v0, v1 1880; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1881; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1882; SI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 1883; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1884; SI-NEXT: s_endpgm 1885; 1886; VI-LABEL: v_udiv_i24: 1887; VI: ; %bb.0: 1888; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1889; VI-NEXT: s_mov_b32 s7, 0xf000 1890; VI-NEXT: s_mov_b32 s6, -1 1891; VI-NEXT: s_mov_b32 s10, s6 1892; VI-NEXT: s_mov_b32 s11, s7 1893; VI-NEXT: s_waitcnt lgkmcnt(0) 1894; VI-NEXT: s_mov_b32 s8, s2 1895; VI-NEXT: s_mov_b32 s9, s3 1896; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6 1897; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4 1898; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 1899; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0 1900; VI-NEXT: s_mov_b32 s4, s0 1901; VI-NEXT: s_mov_b32 s5, s1 1902; VI-NEXT: s_waitcnt vmcnt(3) 1903; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1904; VI-NEXT: s_waitcnt vmcnt(2) 1905; VI-NEXT: v_or_b32_e32 v0, v1, v0 1906; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 1907; VI-NEXT: s_waitcnt vmcnt(1) 1908; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 1909; VI-NEXT: s_waitcnt vmcnt(0) 1910; VI-NEXT: v_or_b32_e32 v1, v3, v1 1911; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 1912; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 1913; VI-NEXT: v_mul_f32_e32 v2, v1, v2 1914; VI-NEXT: v_trunc_f32_e32 v2, v2 1915; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 1916; VI-NEXT: v_mad_f32 v1, -v2, v0, v1 1917; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1918; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1919; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0 1920; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1921; VI-NEXT: s_endpgm 1922; 1923; GCN-LABEL: v_udiv_i24: 1924; GCN: ; %bb.0: 1925; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1926; GCN-NEXT: s_waitcnt lgkmcnt(0) 1927; GCN-NEXT: s_add_u32 s4, s2, 4 1928; 
GCN-NEXT: s_addc_u32 s5, s3, 0 1929; GCN-NEXT: s_add_u32 s6, s2, 2 1930; GCN-NEXT: s_addc_u32 s7, s3, 0 1931; GCN-NEXT: v_mov_b32_e32 v0, s6 1932; GCN-NEXT: v_mov_b32_e32 v1, s7 1933; GCN-NEXT: s_add_u32 s6, s2, 6 1934; GCN-NEXT: s_addc_u32 s7, s3, 0 1935; GCN-NEXT: v_mov_b32_e32 v2, s6 1936; GCN-NEXT: v_mov_b32_e32 v3, s7 1937; GCN-NEXT: v_mov_b32_e32 v4, s4 1938; GCN-NEXT: v_mov_b32_e32 v5, s5 1939; GCN-NEXT: flat_load_ubyte v6, v[2:3] 1940; GCN-NEXT: flat_load_ushort v4, v[4:5] 1941; GCN-NEXT: v_mov_b32_e32 v2, s2 1942; GCN-NEXT: v_mov_b32_e32 v3, s3 1943; GCN-NEXT: flat_load_ubyte v0, v[0:1] 1944; GCN-NEXT: flat_load_ushort v1, v[2:3] 1945; GCN-NEXT: s_waitcnt vmcnt(3) 1946; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 1947; GCN-NEXT: s_waitcnt vmcnt(2) 1948; GCN-NEXT: v_or_b32_e32 v2, v4, v2 1949; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 1950; GCN-NEXT: s_waitcnt vmcnt(1) 1951; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1952; GCN-NEXT: s_waitcnt vmcnt(0) 1953; GCN-NEXT: v_or_b32_e32 v0, v1, v0 1954; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 1955; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 1956; GCN-NEXT: v_mov_b32_e32 v0, s0 1957; GCN-NEXT: v_mov_b32_e32 v1, s1 1958; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 1959; GCN-NEXT: v_trunc_f32_e32 v4, v4 1960; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1961; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 1962; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 1963; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 1964; GCN-NEXT: v_and_b32_e32 v2, 0xffffff, v2 1965; GCN-NEXT: flat_store_dword v[0:1], v2 1966; GCN-NEXT: s_endpgm 1967; 1968; GFX1030-LABEL: v_udiv_i24: 1969; GFX1030: ; %bb.0: 1970; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 1971; GFX1030-NEXT: v_mov_b32_e32 v0, 0 1972; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 1973; GFX1030-NEXT: s_clause 0x3 1974; GFX1030-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 1975; GFX1030-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 1976; GFX1030-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2 1977; GFX1030-NEXT: global_load_ushort v4, v0, 
s[2:3] 1978; GFX1030-NEXT: s_waitcnt vmcnt(3) 1979; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1980; GFX1030-NEXT: s_waitcnt vmcnt(2) 1981; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1 1982; GFX1030-NEXT: s_waitcnt vmcnt(1) 1983; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1984; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 1985; GFX1030-NEXT: s_waitcnt vmcnt(0) 1986; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 1987; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 1988; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 1989; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 1990; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 1991; GFX1030-NEXT: v_fma_f32 v2, -v3, v1, v2 1992; GFX1030-NEXT: v_cvt_u32_f32_e32 v3, v3 1993; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v1 1994; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo 1995; GFX1030-NEXT: v_and_b32_e32 v1, 0xffffff, v1 1996; GFX1030-NEXT: global_store_dword v0, v1, s[0:1] 1997; GFX1030-NEXT: s_endpgm 1998; 1999; EG-LABEL: v_udiv_i24: 2000; EG: ; %bb.0: 2001; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 2002; EG-NEXT: TEX 3 @6 2003; EG-NEXT: ALU 23, @15, KC0[CB0:0-32], KC1[] 2004; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2005; EG-NEXT: CF_END 2006; EG-NEXT: PAD 2007; EG-NEXT: Fetch clause starting at 6: 2008; EG-NEXT: VTX_READ_8 T1.X, T0.X, 6, #1 2009; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1 2010; EG-NEXT: VTX_READ_8 T3.X, T0.X, 2, #1 2011; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 2012; EG-NEXT: ALU clause starting at 14: 2013; EG-NEXT: MOV * T0.X, KC0[2].Z, 2014; EG-NEXT: ALU clause starting at 15: 2015; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 2016; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2017; EG-NEXT: OR_INT * T0.W, T0.X, PV.W, 2018; EG-NEXT: SUB_INT T1.W, 0.0, PV.W, 2019; EG-NEXT: RECIP_UINT * T0.X, PV.W, 2020; EG-NEXT: MULLO_INT * T0.Y, PV.W, PS, 2021; EG-NEXT: LSHL T1.W, T3.X, literal.x, 2022; EG-NEXT: MULHI * T0.Y, T0.X, PS, 2023; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 2024; EG-NEXT: ADD_INT T2.W, T0.X, PS, 2025; EG-NEXT: OR_INT * T1.W, T2.X, 
PV.W, 2026; EG-NEXT: MULHI * T0.X, PS, PV.W, 2027; EG-NEXT: MULLO_INT * T0.Y, PS, T0.W, 2028; EG-NEXT: SUB_INT * T1.W, T1.W, PS, 2029; EG-NEXT: ADD_INT T0.Z, T0.X, 1, 2030; EG-NEXT: SETGE_UINT T2.W, PV.W, T0.W, 2031; EG-NEXT: SUB_INT * T3.W, PV.W, T0.W, 2032; EG-NEXT: CNDE_INT T1.W, PV.W, T1.W, PS, 2033; EG-NEXT: CNDE_INT * T2.W, PV.W, T0.X, PV.Z, 2034; EG-NEXT: ADD_INT T3.W, PS, 1, 2035; EG-NEXT: SETGE_UINT * T0.W, PV.W, T0.W, 2036; EG-NEXT: CNDE_INT T0.X, PS, T2.W, PV.W, 2037; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2038; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2039 %den_ptr = getelementptr i24, ptr addrspace(1) %in, i24 1 2040 %num = load i24, ptr addrspace(1) %in 2041 %den = load i24, ptr addrspace(1) %den_ptr 2042 %result = udiv i24 %num, %den 2043 %result.ext = zext i24 %result to i32 2044 store i32 %result.ext, ptr addrspace(1) %out 2045 ret void 2046} 2047 2048define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture readonly %in, ptr addrspace(1) nocapture %out) { 2049; SI-LABEL: scalarize_mulhu_4xi32: 2050; SI: ; %bb.0: 2051; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2052; SI-NEXT: s_mov_b32 s7, 0xf000 2053; SI-NEXT: s_mov_b32 s6, -1 2054; SI-NEXT: s_waitcnt lgkmcnt(0) 2055; SI-NEXT: s_mov_b32 s4, s0 2056; SI-NEXT: s_mov_b32 s5, s1 2057; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2058; SI-NEXT: s_mov_b32 s0, 0x1389c755 2059; SI-NEXT: s_mov_b32 s4, s2 2060; SI-NEXT: s_mov_b32 s5, s3 2061; SI-NEXT: s_waitcnt vmcnt(0) 2062; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2063; SI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2064; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2065; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2066; SI-NEXT: v_mul_hi_u32 v0, v0, s0 2067; SI-NEXT: v_mul_hi_u32 v1, v1, s0 2068; SI-NEXT: v_mul_hi_u32 v2, v2, s0 2069; SI-NEXT: v_mul_hi_u32 v3, v3, s0 2070; SI-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2071; SI-NEXT: v_lshrrev_b32_e32 v1, 10, v1 2072; SI-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2073; SI-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2074; 
SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2075; SI-NEXT: s_endpgm 2076; 2077; VI-LABEL: scalarize_mulhu_4xi32: 2078; VI: ; %bb.0: 2079; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2080; VI-NEXT: s_mov_b32 s7, 0xf000 2081; VI-NEXT: s_mov_b32 s6, -1 2082; VI-NEXT: s_waitcnt lgkmcnt(0) 2083; VI-NEXT: s_mov_b32 s4, s0 2084; VI-NEXT: s_mov_b32 s5, s1 2085; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 2086; VI-NEXT: s_mov_b32 s0, 0x1389c755 2087; VI-NEXT: s_mov_b32 s4, s2 2088; VI-NEXT: s_mov_b32 s5, s3 2089; VI-NEXT: s_waitcnt vmcnt(0) 2090; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2091; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2092; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2093; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2094; VI-NEXT: v_mul_hi_u32 v0, v0, s0 2095; VI-NEXT: v_mul_hi_u32 v1, v1, s0 2096; VI-NEXT: v_mul_hi_u32 v2, v2, s0 2097; VI-NEXT: v_mul_hi_u32 v3, v3, s0 2098; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2099; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1 2100; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2101; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2102; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 2103; VI-NEXT: s_endpgm 2104; 2105; GCN-LABEL: scalarize_mulhu_4xi32: 2106; GCN: ; %bb.0: 2107; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2108; GCN-NEXT: s_waitcnt lgkmcnt(0) 2109; GCN-NEXT: v_mov_b32_e32 v0, s0 2110; GCN-NEXT: v_mov_b32_e32 v1, s1 2111; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 2112; GCN-NEXT: s_mov_b32 s0, 0x1389c755 2113; GCN-NEXT: v_mov_b32_e32 v4, s2 2114; GCN-NEXT: v_mov_b32_e32 v5, s3 2115; GCN-NEXT: s_waitcnt vmcnt(0) 2116; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2117; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2118; GCN-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2119; GCN-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2120; GCN-NEXT: v_mul_hi_u32 v0, v0, s0 2121; GCN-NEXT: v_mul_hi_u32 v1, v1, s0 2122; GCN-NEXT: v_mul_hi_u32 v2, v2, s0 2123; GCN-NEXT: v_mul_hi_u32 v3, v3, s0 2124; GCN-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2125; GCN-NEXT: v_lshrrev_b32_e32 v1, 10, v1 
2126; GCN-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2127; GCN-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2128; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 2129; GCN-NEXT: s_endpgm 2130; 2131; GFX1030-LABEL: scalarize_mulhu_4xi32: 2132; GFX1030: ; %bb.0: 2133; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 2134; GFX1030-NEXT: v_mov_b32_e32 v4, 0 2135; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 2136; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] 2137; GFX1030-NEXT: s_waitcnt vmcnt(0) 2138; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0 2139; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1 2140; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 2, v2 2141; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 2142; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0 2143; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1 2144; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2 2145; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3 2146; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0 2147; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1 2148; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2 2149; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 10, v3 2150; GFX1030-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 2151; GFX1030-NEXT: s_endpgm 2152; 2153; EG-LABEL: scalarize_mulhu_4xi32: 2154; EG: ; %bb.0: 2155; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 2156; EG-NEXT: TEX 0 @6 2157; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] 2158; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 2159; EG-NEXT: CF_END 2160; EG-NEXT: PAD 2161; EG-NEXT: Fetch clause starting at 6: 2162; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 2163; EG-NEXT: ALU clause starting at 8: 2164; EG-NEXT: MOV * T0.X, KC0[2].Y, 2165; EG-NEXT: ALU clause starting at 9: 2166; EG-NEXT: LSHR T0.W, T0.W, literal.x, 2167; EG-NEXT: LSHR * T1.W, T0.Z, literal.x, 2168; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2169; EG-NEXT: MULHI * T0.Z, PV.W, literal.x, 2170; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2171; EG-NEXT: LSHR T1.Z, T0.Y, literal.x, 2172; EG-NEXT: LSHR T0.W, PS, literal.y, 2173; EG-NEXT: MULHI * 
T0.Y, T1.W, literal.z, 2174; EG-NEXT: 2(2.802597e-45), 10(1.401298e-44) 2175; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2176; EG-NEXT: LSHR T0.Z, PS, literal.x, 2177; EG-NEXT: LSHR T1.W, T0.X, literal.y, 2178; EG-NEXT: MULHI * T0.X, PV.Z, literal.z, 2179; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45) 2180; EG-NEXT: 327796565(3.478022e-27), 0(0.000000e+00) 2181; EG-NEXT: LSHR T0.Y, PS, literal.x, 2182; EG-NEXT: MULHI * T0.X, PV.W, literal.y, 2183; EG-NEXT: 10(1.401298e-44), 327796565(3.478022e-27) 2184; EG-NEXT: LSHR T0.X, PS, literal.x, 2185; EG-NEXT: LSHR * T1.X, KC0[2].Z, literal.y, 2186; EG-NEXT: 10(1.401298e-44), 2(2.802597e-45) 2187 %1 = load <4 x i32>, ptr addrspace(1) %in, align 16 2188 %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668> 2189 store <4 x i32> %2, ptr addrspace(1) %out, align 16 2190 ret void 2191} 2192 2193define amdgpu_kernel void @test_udiv2(i32 %p) { 2194; SI-LABEL: test_udiv2: 2195; SI: ; %bb.0: 2196; SI-NEXT: s_load_dword s0, s[4:5], 0x9 2197; SI-NEXT: s_mov_b32 s3, 0xf000 2198; SI-NEXT: s_mov_b32 s2, -1 2199; SI-NEXT: s_waitcnt lgkmcnt(0) 2200; SI-NEXT: s_lshr_b32 s0, s0, 1 2201; SI-NEXT: v_mov_b32_e32 v0, s0 2202; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2203; SI-NEXT: s_waitcnt vmcnt(0) 2204; SI-NEXT: s_endpgm 2205; 2206; VI-LABEL: test_udiv2: 2207; VI: ; %bb.0: 2208; VI-NEXT: s_load_dword s0, s[4:5], 0x24 2209; VI-NEXT: s_mov_b32 s3, 0xf000 2210; VI-NEXT: s_mov_b32 s2, -1 2211; VI-NEXT: s_waitcnt lgkmcnt(0) 2212; VI-NEXT: s_lshr_b32 s0, s0, 1 2213; VI-NEXT: v_mov_b32_e32 v0, s0 2214; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2215; VI-NEXT: s_waitcnt vmcnt(0) 2216; VI-NEXT: s_endpgm 2217; 2218; GCN-LABEL: test_udiv2: 2219; GCN: ; %bb.0: 2220; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 2221; GCN-NEXT: s_waitcnt lgkmcnt(0) 2222; GCN-NEXT: s_lshr_b32 s0, s0, 1 2223; GCN-NEXT: v_mov_b32_e32 v0, s0 2224; GCN-NEXT: flat_store_dword v[0:1], v0 2225; GCN-NEXT: s_waitcnt vmcnt(0) 2226; GCN-NEXT: s_endpgm 2227; 
2228; GFX1030-LABEL: test_udiv2: 2229; GFX1030: ; %bb.0: 2230; GFX1030-NEXT: s_load_dword s0, s[8:9], 0x0 2231; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 2232; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 2233; GFX1030-NEXT: v_mov_b32_e32 v0, s0 2234; GFX1030-NEXT: global_store_dword v[0:1], v0, off 2235; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 2236; GFX1030-NEXT: s_endpgm 2237; 2238; EG-LABEL: test_udiv2: 2239; EG: ; %bb.0: 2240; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 2241; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 2242; EG-NEXT: CF_END 2243; EG-NEXT: PAD 2244; EG-NEXT: ALU clause starting at 4: 2245; EG-NEXT: MOV T0.X, literal.x, 2246; EG-NEXT: LSHR * T1.X, KC0[2].Y, 1, 2247; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2248 %i = udiv i32 %p, 2 2249 store volatile i32 %i, ptr addrspace(1) undef 2250 ret void 2251} 2252 2253define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { 2254; SI-LABEL: test_udiv_3_mulhu: 2255; SI: ; %bb.0: 2256; SI-NEXT: s_load_dword s0, s[4:5], 0x9 2257; SI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab 2258; SI-NEXT: s_mov_b32 s3, 0xf000 2259; SI-NEXT: s_mov_b32 s2, -1 2260; SI-NEXT: s_waitcnt lgkmcnt(0) 2261; SI-NEXT: v_mul_hi_u32 v0, s0, v0 2262; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 2263; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2264; SI-NEXT: s_waitcnt vmcnt(0) 2265; SI-NEXT: s_endpgm 2266; 2267; VI-LABEL: test_udiv_3_mulhu: 2268; VI: ; %bb.0: 2269; VI-NEXT: s_load_dword s0, s[4:5], 0x24 2270; VI-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab 2271; VI-NEXT: s_mov_b32 s3, 0xf000 2272; VI-NEXT: s_mov_b32 s2, -1 2273; VI-NEXT: s_waitcnt lgkmcnt(0) 2274; VI-NEXT: v_mul_hi_u32 v0, s0, v0 2275; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 2276; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2277; VI-NEXT: s_waitcnt vmcnt(0) 2278; VI-NEXT: s_endpgm 2279; 2280; GCN-LABEL: test_udiv_3_mulhu: 2281; GCN: ; %bb.0: 2282; GCN-NEXT: s_load_dword s0, s[8:9], 0x0 2283; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab 2284; GCN-NEXT: s_waitcnt lgkmcnt(0) 2285; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 
2286; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 2287; GCN-NEXT: flat_store_dword v[0:1], v0 2288; GCN-NEXT: s_waitcnt vmcnt(0) 2289; GCN-NEXT: s_endpgm 2290; 2291; GFX1030-LABEL: test_udiv_3_mulhu: 2292; GFX1030: ; %bb.0: 2293; GFX1030-NEXT: s_load_dword s0, s[8:9], 0x0 2294; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX1030-NEXT: s_mul_hi_u32 s0, s0, 0xaaaaaaab 2296; GFX1030-NEXT: s_lshr_b32 s0, s0, 1 2297; GFX1030-NEXT: v_mov_b32_e32 v0, s0 2298; GFX1030-NEXT: global_store_dword v[0:1], v0, off 2299; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 2300; GFX1030-NEXT: s_endpgm 2301; 2302; EG-LABEL: test_udiv_3_mulhu: 2303; EG: ; %bb.0: 2304; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 2305; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2306; EG-NEXT: CF_END 2307; EG-NEXT: PAD 2308; EG-NEXT: ALU clause starting at 4: 2309; EG-NEXT: MULHI * T0.X, KC0[2].Y, literal.x, 2310; EG-NEXT: -1431655765(-3.031649e-13), 0(0.000000e+00) 2311; EG-NEXT: LSHR T0.X, PS, 1, 2312; EG-NEXT: MOV * T1.X, literal.x, 2313; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2314 %i = udiv i32 %p, 3 2315 store volatile i32 %i, ptr addrspace(1) undef 2316 ret void 2317} 2318 2319define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readonly %arg) { 2320; SI-LABEL: fdiv_test_denormals: 2321; SI: ; %bb.0: ; %bb 2322; SI-NEXT: s_mov_b32 s0, 0 2323; SI-NEXT: s_mov_b32 s3, 0xf000 2324; SI-NEXT: s_mov_b32 s2, -1 2325; SI-NEXT: s_mov_b32 s1, s0 2326; SI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 2327; SI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 2328; SI-NEXT: s_waitcnt vmcnt(1) 2329; SI-NEXT: v_cvt_f32_i32_e32 v2, v0 2330; SI-NEXT: s_waitcnt vmcnt(0) 2331; SI-NEXT: v_cvt_f32_i32_e32 v3, v1 2332; SI-NEXT: v_xor_b32_e32 v0, v1, v0 2333; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 2334; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2 2335; SI-NEXT: v_or_b32_e32 v0, 1, v0 2336; SI-NEXT: v_mul_f32_e32 v1, v3, v4 2337; SI-NEXT: v_trunc_f32_e32 v1, v1 2338; SI-NEXT: v_mad_f32 v3, -v1, v2, v3 2339; SI-NEXT: 
v_cvt_i32_f32_e32 v1, v1 2340; SI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 2341; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 2342; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2343; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2344; SI-NEXT: s_endpgm 2345; 2346; VI-LABEL: fdiv_test_denormals: 2347; VI: ; %bb.0: ; %bb 2348; VI-NEXT: s_mov_b32 s0, 0 2349; VI-NEXT: s_mov_b32 s3, 0xf000 2350; VI-NEXT: s_mov_b32 s2, -1 2351; VI-NEXT: s_mov_b32 s1, s0 2352; VI-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 2353; VI-NEXT: buffer_load_sbyte v1, off, s[0:3], 0 2354; VI-NEXT: s_waitcnt vmcnt(1) 2355; VI-NEXT: v_cvt_f32_i32_e32 v2, v0 2356; VI-NEXT: s_waitcnt vmcnt(0) 2357; VI-NEXT: v_cvt_f32_i32_e32 v3, v1 2358; VI-NEXT: v_xor_b32_e32 v0, v1, v0 2359; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 2360; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 2361; VI-NEXT: v_or_b32_e32 v0, 1, v0 2362; VI-NEXT: v_mul_f32_e32 v1, v3, v4 2363; VI-NEXT: v_trunc_f32_e32 v1, v1 2364; VI-NEXT: v_mad_f32 v3, -v1, v2, v3 2365; VI-NEXT: v_cvt_i32_f32_e32 v1, v1 2366; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 2367; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 2368; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 2369; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 2370; VI-NEXT: s_endpgm 2371; 2372; GCN-LABEL: fdiv_test_denormals: 2373; GCN: ; %bb.0: ; %bb 2374; GCN-NEXT: flat_load_sbyte v2, v[0:1] 2375; GCN-NEXT: v_mov_b32_e32 v0, 0 2376; GCN-NEXT: v_mov_b32_e32 v1, 0 2377; GCN-NEXT: flat_load_sbyte v3, v[0:1] 2378; GCN-NEXT: s_waitcnt vmcnt(1) 2379; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2 2380; GCN-NEXT: s_waitcnt vmcnt(0) 2381; GCN-NEXT: v_cvt_f32_i32_e32 v5, v3 2382; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 2383; GCN-NEXT: v_xor_b32_e32 v2, v3, v2 2384; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 2385; GCN-NEXT: v_or_b32_e32 v2, 1, v2 2386; GCN-NEXT: v_mul_f32_e32 v3, v5, v6 2387; GCN-NEXT: v_trunc_f32_e32 v3, v3 2388; GCN-NEXT: v_mad_f32 v5, -v3, v4, v5 2389; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2390; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 2391; 
GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 2392; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v3 2393; GCN-NEXT: flat_store_byte v[0:1], v2 2394; GCN-NEXT: s_endpgm 2395; 2396; GFX1030-LABEL: fdiv_test_denormals: 2397; GFX1030: ; %bb.0: ; %bb 2398; GFX1030-NEXT: global_load_sbyte v2, v[0:1], off 2399; GFX1030-NEXT: v_mov_b32_e32 v0, 0 2400; GFX1030-NEXT: v_mov_b32_e32 v1, 0 2401; GFX1030-NEXT: global_load_sbyte v3, v[0:1], off 2402; GFX1030-NEXT: s_waitcnt vmcnt(1) 2403; GFX1030-NEXT: v_cvt_f32_i32_e32 v4, v2 2404; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v4 2405; GFX1030-NEXT: s_waitcnt vmcnt(0) 2406; GFX1030-NEXT: v_cvt_f32_i32_e32 v6, v3 2407; GFX1030-NEXT: v_xor_b32_e32 v2, v3, v2 2408; GFX1030-NEXT: v_ashrrev_i32_e32 v2, 30, v2 2409; GFX1030-NEXT: v_mul_f32_e32 v5, v6, v5 2410; GFX1030-NEXT: v_or_b32_e32 v2, 1, v2 2411; GFX1030-NEXT: v_trunc_f32_e32 v3, v5 2412; GFX1030-NEXT: v_fma_f32 v5, -v3, v4, v6 2413; GFX1030-NEXT: v_cvt_i32_f32_e32 v3, v3 2414; GFX1030-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v4| 2415; GFX1030-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo 2416; GFX1030-NEXT: v_add_nc_u32_e32 v2, v3, v2 2417; GFX1030-NEXT: global_store_byte v[0:1], v2, off 2418; GFX1030-NEXT: s_endpgm 2419; 2420; EG-LABEL: fdiv_test_denormals: 2421; EG: ; %bb.0: ; %bb 2422; EG-NEXT: TEX 0 @6 2423; EG-NEXT: ALU 0, @10, KC0[], KC1[] 2424; EG-NEXT: TEX 0 @8 2425; EG-NEXT: ALU 25, @11, KC0[], KC1[] 2426; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 2427; EG-NEXT: CF_END 2428; EG-NEXT: Fetch clause starting at 6: 2429; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 2430; EG-NEXT: Fetch clause starting at 8: 2431; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1 2432; EG-NEXT: ALU clause starting at 10: 2433; EG-NEXT: MOV * T1.X, 0.0, 2434; EG-NEXT: ALU clause starting at 11: 2435; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, 2436; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) 2437; EG-NEXT: INT_TO_FLT * T0.X, PV.W, 2438; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, 2439; EG-NEXT: RECIP_IEEE * T0.Y, PS, 2440; EG-NEXT: 
8(1.121039e-44), 0(0.000000e+00) 2441; EG-NEXT: INT_TO_FLT * T0.Z, PV.W, 2442; EG-NEXT: MUL_IEEE * T2.W, PS, T0.Y, 2443; EG-NEXT: TRUNC T2.W, PV.W, 2444; EG-NEXT: XOR_INT * T0.W, T1.W, T0.W, 2445; EG-NEXT: ASHR T0.W, PS, literal.x, 2446; EG-NEXT: MULADD_IEEE * T1.W, -PV.W, T0.X, T0.Z, 2447; EG-NEXT: 30(4.203895e-44), 0(0.000000e+00) 2448; EG-NEXT: TRUNC T0.Z, T2.W, 2449; EG-NEXT: SETGE T1.W, |PS|, |T0.X|, 2450; EG-NEXT: OR_INT * T0.W, PV.W, 1, 2451; EG-NEXT: CNDE T0.W, PV.W, 0.0, PS, 2452; EG-NEXT: FLT_TO_INT * T1.W, PV.Z, 2453; EG-NEXT: ADD_INT * T0.W, PS, PV.W, 2454; EG-NEXT: AND_INT T0.X, PV.W, literal.x, 2455; EG-NEXT: MOV * T0.W, literal.x, 2456; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2457; EG-NEXT: MOV T0.Y, 0.0, 2458; EG-NEXT: MOV * T0.Z, 0.0, 2459; EG-NEXT: MOV * T1.X, literal.x, 2460; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2461bb: 2462 %tmp = load i8, ptr addrspace(1) null, align 1 2463 %tmp1 = sext i8 %tmp to i32 2464 %tmp2 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 undef 2465 %tmp3 = load i8, ptr addrspace(1) %tmp2, align 1 2466 %tmp4 = sext i8 %tmp3 to i32 2467 %tmp5 = sdiv i32 %tmp1, %tmp4 2468 %tmp6 = trunc i32 %tmp5 to i8 2469 store i8 %tmp6, ptr addrspace(1) null, align 1 2470 ret void 2471} 2472 2473define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { 2474; SI-LABEL: v_test_udiv64_mulhi_fold: 2475; SI: ; %bb.0: 2476; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2477; SI-NEXT: v_alignbit_b32 v0, v1, v0, 5 2478; SI-NEXT: s_mov_b32 s4, 0x71b47843 2479; SI-NEXT: v_lshrrev_b32_e32 v1, 5, v1 2480; SI-NEXT: v_mul_hi_u32 v3, v0, s4 2481; SI-NEXT: v_mul_lo_u32 v4, v1, s4 2482; SI-NEXT: s_mov_b32 s6, 0xa7c5ac4 2483; SI-NEXT: v_mul_hi_u32 v5, v1, s4 2484; SI-NEXT: v_mul_hi_u32 v2, v0, s6 2485; SI-NEXT: v_mul_lo_u32 v0, v0, s6 2486; SI-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2487; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 2488; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v3 2489; SI-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2490; SI-NEXT: 
v_mul_lo_u32 v2, v1, s6 2491; SI-NEXT: v_mul_hi_u32 v1, v1, s6 2492; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 2493; SI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc 2494; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2495; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 2496; SI-NEXT: v_alignbit_b32 v0, v1, v0, 7 2497; SI-NEXT: v_lshrrev_b32_e32 v1, 7, v1 2498; SI-NEXT: s_setpc_b64 s[30:31] 2499; 2500; VI-LABEL: v_test_udiv64_mulhi_fold: 2501; VI: ; %bb.0: 2502; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2503; VI-NEXT: v_alignbit_b32 v4, v1, v0, 5 2504; VI-NEXT: s_mov_b32 s4, 0x71b47843 2505; VI-NEXT: v_mul_hi_u32 v2, v4, s4 2506; VI-NEXT: v_mov_b32_e32 v3, 0 2507; VI-NEXT: v_lshrrev_b32_e32 v5, 5, v1 2508; VI-NEXT: s_mov_b32 s6, 0xa7c5ac4 2509; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3] 2510; VI-NEXT: v_mov_b32_e32 v2, v0 2511; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3] 2512; VI-NEXT: v_mov_b32_e32 v0, v1 2513; VI-NEXT: v_mov_b32_e32 v1, v3 2514; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 2515; VI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc 2516; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] 2517; VI-NEXT: v_alignbit_b32 v0, v1, v0, 7 2518; VI-NEXT: v_lshrrev_b32_e32 v1, 7, v1 2519; VI-NEXT: s_setpc_b64 s[30:31] 2520; 2521; GCN-LABEL: v_test_udiv64_mulhi_fold: 2522; GCN: ; %bb.0: 2523; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2524; GCN-NEXT: v_alignbit_b32 v4, v1, v0, 5 2525; GCN-NEXT: s_mov_b32 s4, 0x71b47843 2526; GCN-NEXT: v_mul_hi_u32 v2, v4, s4 2527; GCN-NEXT: v_mov_b32_e32 v3, 0 2528; GCN-NEXT: v_lshrrev_b32_e32 v5, 5, v1 2529; GCN-NEXT: s_mov_b32 s6, 0xa7c5ac4 2530; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s4, v[2:3] 2531; GCN-NEXT: v_mov_b32_e32 v2, v0 2532; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, v[2:3] 2533; GCN-NEXT: v_mov_b32_e32 v0, v1 2534; GCN-NEXT: v_mov_b32_e32 v1, v3 2535; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 2536; GCN-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc 2537; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v5, s6, v[0:1] 2538; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 7 2539; GCN-NEXT: v_lshrrev_b32_e32 v1, 7, v1 2540; GCN-NEXT: s_setpc_b64 s[30:31] 2541; 2542; GFX1030-LABEL: v_test_udiv64_mulhi_fold: 2543; GFX1030: ; %bb.0: 2544; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2545; GFX1030-NEXT: v_alignbit_b32 v4, v1, v0, 5 2546; GFX1030-NEXT: v_mov_b32_e32 v3, 0 2547; GFX1030-NEXT: v_lshrrev_b32_e32 v5, 5, v1 2548; GFX1030-NEXT: v_mul_hi_u32 v2, 0x71b47843, v4 2549; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0x71b47843, v5, v[2:3] 2550; GFX1030-NEXT: v_mov_b32_e32 v2, v0 2551; GFX1030-NEXT: v_mov_b32_e32 v0, v1 2552; GFX1030-NEXT: v_mad_u64_u32 v[2:3], null, 0xa7c5ac4, v4, v[2:3] 2553; GFX1030-NEXT: v_mov_b32_e32 v1, v3 2554; GFX1030-NEXT: v_add_co_u32 v0, s4, v0, v1 2555; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s4 2556; GFX1030-NEXT: v_mad_u64_u32 v[0:1], null, 0xa7c5ac4, v5, v[0:1] 2557; GFX1030-NEXT: v_alignbit_b32 v0, v1, v0, 7 2558; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 7, v1 2559; GFX1030-NEXT: s_setpc_b64 s[30:31] 2560; 2561; EG-LABEL: v_test_udiv64_mulhi_fold: 2562; EG: ; %bb.0: 2563; EG-NEXT: CF_END 2564; EG-NEXT: PAD 2565 %d = udiv i64 %arg, 100000 2566 ret i64 %d 2567} 2568