1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s 3; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s 5 6define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { 7; CHECK-LABEL: @udiv_i32( 8; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 9; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 10; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 11; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 12; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 13; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 14; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 15; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 16; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 17; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 18; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 19; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 20; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 21; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 22; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 23; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 24; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 25; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 26; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 27; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 28; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 29; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 30; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1 31; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]] 32; CHECK-NEXT: [[TMP25:%.*]] = sub 
i32 [[TMP21]], [[Y]] 33; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]] 34; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]] 35; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1 36; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]] 37; CHECK-NEXT: store i32 [[TMP29]], ptr addrspace(1) [[OUT:%.*]], align 4 38; CHECK-NEXT: ret void 39; 40; GFX6-LABEL: udiv_i32: 41; GFX6: ; %bb.0: 42; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 43; GFX6-NEXT: s_mov_b32 s7, 0xf000 44; GFX6-NEXT: s_mov_b32 s6, -1 45; GFX6-NEXT: s_waitcnt lgkmcnt(0) 46; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 47; GFX6-NEXT: s_sub_i32 s4, 0, s3 48; GFX6-NEXT: s_mov_b32 s5, s1 49; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 50; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 51; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 52; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 53; GFX6-NEXT: s_mov_b32 s4, s0 54; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 55; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 56; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 57; GFX6-NEXT: v_readfirstlane_b32 s0, v0 58; GFX6-NEXT: s_mul_i32 s0, s0, s3 59; GFX6-NEXT: s_sub_i32 s0, s2, s0 60; GFX6-NEXT: s_sub_i32 s1, s0, s3 61; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 62; GFX6-NEXT: s_cmp_ge_u32 s0, s3 63; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 64; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 65; GFX6-NEXT: s_cselect_b32 s0, s1, s0 66; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 67; GFX6-NEXT: s_cmp_ge_u32 s0, s3 68; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 69; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 70; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 71; GFX6-NEXT: s_endpgm 72; 73; GFX9-LABEL: udiv_i32: 74; GFX9: ; %bb.0: 75; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 76; GFX9-NEXT: v_mov_b32_e32 v1, 0 77; GFX9-NEXT: s_waitcnt lgkmcnt(0) 78; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 79; GFX9-NEXT: s_sub_i32 s4, 0, s3 80; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 81; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 82; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 83; GFX9-NEXT: v_readfirstlane_b32 s5, v0 84; GFX9-NEXT: s_mul_i32 s4, s4, s5 85; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 86; GFX9-NEXT: s_add_i32 s5, s5, s4 87; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 88; GFX9-NEXT: s_mul_i32 s5, s4, s3 89; GFX9-NEXT: s_sub_i32 s2, s2, s5 90; GFX9-NEXT: s_add_i32 s6, s4, 1 91; GFX9-NEXT: s_sub_i32 s5, s2, s3 92; GFX9-NEXT: s_cmp_ge_u32 s2, s3 93; GFX9-NEXT: s_cselect_b32 s4, s6, s4 94; GFX9-NEXT: s_cselect_b32 s2, s5, s2 95; GFX9-NEXT: s_add_i32 s5, s4, 1 96; GFX9-NEXT: s_cmp_ge_u32 s2, s3 97; GFX9-NEXT: s_cselect_b32 s2, s5, s4 98; GFX9-NEXT: v_mov_b32_e32 v0, s2 99; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 100; GFX9-NEXT: s_endpgm 101 %r = udiv i32 %x, %y 102 store i32 %r, ptr addrspace(1) %out 103 ret void 104} 105 106define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { 107; CHECK-LABEL: @urem_i32( 108; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 109; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 110; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 111; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 112; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 113; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 114; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 115; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 116; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 117; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 118; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 119; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 120; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 121; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 122; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 123; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 124; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 125; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 126; 
CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 127; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 128; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 129; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 130; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]] 131; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]] 132; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]] 133; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]] 134; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]] 135; CHECK-NEXT: store i32 [[TMP27]], ptr addrspace(1) [[OUT:%.*]], align 4 136; CHECK-NEXT: ret void 137; 138; GFX6-LABEL: urem_i32: 139; GFX6: ; %bb.0: 140; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 141; GFX6-NEXT: s_mov_b32 s7, 0xf000 142; GFX6-NEXT: s_mov_b32 s6, -1 143; GFX6-NEXT: s_waitcnt lgkmcnt(0) 144; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 145; GFX6-NEXT: s_sub_i32 s4, 0, s3 146; GFX6-NEXT: s_mov_b32 s5, s1 147; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 148; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 149; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 150; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 151; GFX6-NEXT: s_mov_b32 s4, s0 152; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 153; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 154; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 155; GFX6-NEXT: v_readfirstlane_b32 s0, v0 156; GFX6-NEXT: s_mul_i32 s0, s0, s3 157; GFX6-NEXT: s_sub_i32 s0, s2, s0 158; GFX6-NEXT: s_sub_i32 s1, s0, s3 159; GFX6-NEXT: s_cmp_ge_u32 s0, s3 160; GFX6-NEXT: s_cselect_b32 s0, s1, s0 161; GFX6-NEXT: s_sub_i32 s1, s0, s3 162; GFX6-NEXT: s_cmp_ge_u32 s0, s3 163; GFX6-NEXT: s_cselect_b32 s0, s1, s0 164; GFX6-NEXT: v_mov_b32_e32 v0, s0 165; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 166; GFX6-NEXT: s_endpgm 167; 168; GFX9-LABEL: urem_i32: 169; GFX9: ; %bb.0: 170; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 171; GFX9-NEXT: v_mov_b32_e32 v1, 0 172; GFX9-NEXT: s_waitcnt lgkmcnt(0) 173; GFX9-NEXT: 
v_cvt_f32_u32_e32 v0, s3 174; GFX9-NEXT: s_sub_i32 s4, 0, s3 175; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 176; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 177; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 178; GFX9-NEXT: v_readfirstlane_b32 s5, v0 179; GFX9-NEXT: s_mul_i32 s4, s4, s5 180; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 181; GFX9-NEXT: s_add_i32 s5, s5, s4 182; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 183; GFX9-NEXT: s_mul_i32 s4, s4, s3 184; GFX9-NEXT: s_sub_i32 s2, s2, s4 185; GFX9-NEXT: s_sub_i32 s4, s2, s3 186; GFX9-NEXT: s_cmp_ge_u32 s2, s3 187; GFX9-NEXT: s_cselect_b32 s2, s4, s2 188; GFX9-NEXT: s_sub_i32 s4, s2, s3 189; GFX9-NEXT: s_cmp_ge_u32 s2, s3 190; GFX9-NEXT: s_cselect_b32 s2, s4, s2 191; GFX9-NEXT: v_mov_b32_e32 v0, s2 192; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 193; GFX9-NEXT: s_endpgm 194 %r = urem i32 %x, %y 195 store i32 %r, ptr addrspace(1) %out 196 ret void 197} 198 199define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { 200; CHECK-LABEL: @sdiv_i32( 201; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 202; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 203; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 204; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]] 205; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]] 206; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]] 207; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]] 208; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float 209; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]]) 210; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000 211; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32 212; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]] 213; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]] 214; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64 215; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 216; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 217; CHECK-NEXT: 
[[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 218; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 219; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 220; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]] 221; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64 222; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 223; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]] 224; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 225; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32 226; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 227; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]] 228; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]] 229; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]] 230; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 231; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 232; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]] 233; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]] 234; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]] 235; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1 236; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]] 237; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]] 238; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]] 239; CHECK-NEXT: store i32 [[TMP38]], ptr addrspace(1) [[OUT:%.*]], align 4 240; CHECK-NEXT: ret void 241; 242; GFX6-LABEL: sdiv_i32: 243; GFX6: ; %bb.0: 244; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 245; GFX6-NEXT: s_mov_b32 s7, 0xf000 246; GFX6-NEXT: s_mov_b32 s6, -1 247; GFX6-NEXT: s_waitcnt lgkmcnt(0) 248; GFX6-NEXT: s_abs_i32 s8, s3 249; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 250; GFX6-NEXT: s_sub_i32 s4, 0, s8 251; GFX6-NEXT: s_mov_b32 s5, s1 252; GFX6-NEXT: s_xor_b32 s1, s2, s3 253; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 254; GFX6-NEXT: s_ashr_i32 s1, s1, 31 255; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 256; 
GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 257; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 258; GFX6-NEXT: s_mov_b32 s4, s0 259; GFX6-NEXT: s_abs_i32 s0, s2 260; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 261; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 262; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 263; GFX6-NEXT: v_readfirstlane_b32 s2, v0 264; GFX6-NEXT: s_mul_i32 s2, s2, s8 265; GFX6-NEXT: s_sub_i32 s0, s0, s2 266; GFX6-NEXT: s_sub_i32 s2, s0, s8 267; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 268; GFX6-NEXT: s_cmp_ge_u32 s0, s8 269; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 270; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 271; GFX6-NEXT: s_cselect_b32 s0, s2, s0 272; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 273; GFX6-NEXT: s_cmp_ge_u32 s0, s8 274; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 275; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 276; GFX6-NEXT: v_xor_b32_e32 v0, s1, v0 277; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0 278; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 279; GFX6-NEXT: s_endpgm 280; 281; GFX9-LABEL: sdiv_i32: 282; GFX9: ; %bb.0: 283; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 284; GFX9-NEXT: v_mov_b32_e32 v1, 0 285; GFX9-NEXT: s_waitcnt lgkmcnt(0) 286; GFX9-NEXT: s_abs_i32 s4, s3 287; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 288; GFX9-NEXT: s_sub_i32 s5, 0, s4 289; GFX9-NEXT: s_xor_b32 s3, s2, s3 290; GFX9-NEXT: s_abs_i32 s2, s2 291; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 292; GFX9-NEXT: s_ashr_i32 s3, s3, 31 293; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 294; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 295; GFX9-NEXT: v_readfirstlane_b32 s6, v0 296; GFX9-NEXT: s_mul_i32 s5, s5, s6 297; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 298; GFX9-NEXT: s_add_i32 s6, s6, s5 299; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 300; GFX9-NEXT: s_mul_i32 s6, s5, s4 301; GFX9-NEXT: s_sub_i32 s2, s2, s6 302; GFX9-NEXT: s_add_i32 s7, s5, 1 303; GFX9-NEXT: s_sub_i32 s6, s2, s4 304; GFX9-NEXT: s_cmp_ge_u32 s2, s4 305; GFX9-NEXT: s_cselect_b32 s5, s7, s5 306; GFX9-NEXT: s_cselect_b32 s2, s6, s2 307; GFX9-NEXT: s_add_i32 s6, s5, 
1 308; GFX9-NEXT: s_cmp_ge_u32 s2, s4 309; GFX9-NEXT: s_cselect_b32 s2, s6, s5 310; GFX9-NEXT: s_xor_b32 s2, s2, s3 311; GFX9-NEXT: s_sub_i32 s2, s2, s3 312; GFX9-NEXT: v_mov_b32_e32 v0, s2 313; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 314; GFX9-NEXT: s_endpgm 315 %r = sdiv i32 %x, %y 316 store i32 %r, ptr addrspace(1) %out 317 ret void 318} 319 320define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { 321; CHECK-LABEL: @srem_i32( 322; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 323; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 324; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]] 325; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]] 326; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]] 327; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]] 328; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float 329; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 330; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000 331; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32 332; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]] 333; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]] 334; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64 335; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 336; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 337; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 338; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 339; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 340; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]] 341; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 342; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 343; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]] 344; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32 345; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32 346; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 347; CHECK-NEXT: 
[[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]] 348; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]] 349; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]] 350; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]] 351; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]] 352; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]] 353; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]] 354; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]] 355; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]] 356; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]] 357; CHECK-NEXT: store i32 [[TMP35]], ptr addrspace(1) [[OUT:%.*]], align 4 358; CHECK-NEXT: ret void 359; 360; GFX6-LABEL: srem_i32: 361; GFX6: ; %bb.0: 362; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 363; GFX6-NEXT: s_mov_b32 s7, 0xf000 364; GFX6-NEXT: s_mov_b32 s6, -1 365; GFX6-NEXT: s_waitcnt lgkmcnt(0) 366; GFX6-NEXT: s_abs_i32 s3, s3 367; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 368; GFX6-NEXT: s_sub_i32 s4, 0, s3 369; GFX6-NEXT: s_abs_i32 s8, s2 370; GFX6-NEXT: s_mov_b32 s5, s1 371; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 372; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 373; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 374; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 375; GFX6-NEXT: s_mov_b32 s4, s0 376; GFX6-NEXT: s_ashr_i32 s0, s2, 31 377; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 378; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 379; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 380; GFX6-NEXT: v_readfirstlane_b32 s1, v0 381; GFX6-NEXT: s_mul_i32 s1, s1, s3 382; GFX6-NEXT: s_sub_i32 s1, s8, s1 383; GFX6-NEXT: s_sub_i32 s2, s1, s3 384; GFX6-NEXT: s_cmp_ge_u32 s1, s3 385; GFX6-NEXT: s_cselect_b32 s1, s2, s1 386; GFX6-NEXT: s_sub_i32 s2, s1, s3 387; GFX6-NEXT: s_cmp_ge_u32 s1, s3 388; GFX6-NEXT: s_cselect_b32 s1, s2, s1 389; GFX6-NEXT: s_xor_b32 s1, s1, s0 390; GFX6-NEXT: s_sub_i32 s0, s1, s0 391; GFX6-NEXT: v_mov_b32_e32 v0, s0 392; GFX6-NEXT: buffer_store_dword v0, 
off, s[4:7], 0 393; GFX6-NEXT: s_endpgm 394; 395; GFX9-LABEL: srem_i32: 396; GFX9: ; %bb.0: 397; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 398; GFX9-NEXT: v_mov_b32_e32 v1, 0 399; GFX9-NEXT: s_waitcnt lgkmcnt(0) 400; GFX9-NEXT: s_abs_i32 s3, s3 401; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 402; GFX9-NEXT: s_sub_i32 s5, 0, s3 403; GFX9-NEXT: s_ashr_i32 s4, s2, 31 404; GFX9-NEXT: s_abs_i32 s2, s2 405; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 406; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 407; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 408; GFX9-NEXT: v_readfirstlane_b32 s6, v0 409; GFX9-NEXT: s_mul_i32 s5, s5, s6 410; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 411; GFX9-NEXT: s_add_i32 s6, s6, s5 412; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 413; GFX9-NEXT: s_mul_i32 s5, s5, s3 414; GFX9-NEXT: s_sub_i32 s2, s2, s5 415; GFX9-NEXT: s_sub_i32 s5, s2, s3 416; GFX9-NEXT: s_cmp_ge_u32 s2, s3 417; GFX9-NEXT: s_cselect_b32 s2, s5, s2 418; GFX9-NEXT: s_sub_i32 s5, s2, s3 419; GFX9-NEXT: s_cmp_ge_u32 s2, s3 420; GFX9-NEXT: s_cselect_b32 s2, s5, s2 421; GFX9-NEXT: s_xor_b32 s2, s2, s4 422; GFX9-NEXT: s_sub_i32 s2, s2, s4 423; GFX9-NEXT: v_mov_b32_e32 v0, s2 424; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 425; GFX9-NEXT: s_endpgm 426 %r = srem i32 %x, %y 427 store i32 %r, ptr addrspace(1) %out 428 ret void 429} 430 431define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { 432; CHECK-LABEL: @udiv_i16( 433; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 434; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 435; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 436; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 437; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 438; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 439; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 440; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 441; CHECK-NEXT: [[TMP9:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 442; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 443; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 444; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 445; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 446; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 447; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 448; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 449; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 450; CHECK-NEXT: store i16 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 2 451; CHECK-NEXT: ret void 452; 453; GFX6-LABEL: udiv_i16: 454; GFX6: ; %bb.0: 455; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb 456; GFX6-NEXT: s_mov_b32 s3, 0xf000 457; GFX6-NEXT: s_mov_b32 s2, -1 458; GFX6-NEXT: s_waitcnt lgkmcnt(0) 459; GFX6-NEXT: s_lshr_b32 s1, s0, 16 460; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 461; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 462; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 463; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 464; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 465; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 466; GFX6-NEXT: v_trunc_f32_e32 v2, v2 467; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 468; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 469; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 470; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 471; GFX6-NEXT: s_waitcnt lgkmcnt(0) 472; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 473; GFX6-NEXT: s_endpgm 474; 475; GFX9-LABEL: udiv_i16: 476; GFX9: ; %bb.0: 477; GFX9-NEXT: s_load_dword s0, s[4:5], 0x2c 478; GFX9-NEXT: v_mov_b32_e32 v3, 0 479; GFX9-NEXT: s_waitcnt lgkmcnt(0) 480; GFX9-NEXT: s_lshr_b32 s1, s0, 16 481; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 482; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 483; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 484; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 485; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 486; GFX9-NEXT: 
v_mul_f32_e32 v2, v1, v2 487; GFX9-NEXT: v_trunc_f32_e32 v2, v2 488; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 489; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 490; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 491; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 492; GFX9-NEXT: s_waitcnt lgkmcnt(0) 493; GFX9-NEXT: global_store_short v3, v0, s[0:1] 494; GFX9-NEXT: s_endpgm 495 %r = udiv i16 %x, %y 496 store i16 %r, ptr addrspace(1) %out 497 ret void 498} 499 500define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { 501; CHECK-LABEL: @urem_i16( 502; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 503; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 504; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 505; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 506; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 507; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 508; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 509; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 510; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 511; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 512; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 513; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 514; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 515; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 516; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 517; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 518; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 519; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 520; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 521; CHECK-NEXT: store i16 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 2 522; CHECK-NEXT: ret void 523; 524; GFX6-LABEL: 
urem_i16: 525; GFX6: ; %bb.0: 526; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 527; GFX6-NEXT: s_mov_b32 s3, 0xf000 528; GFX6-NEXT: s_waitcnt lgkmcnt(0) 529; GFX6-NEXT: s_lshr_b32 s2, s6, 16 530; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 531; GFX6-NEXT: s_and_b32 s0, s6, 0xffff 532; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 533; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 534; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 535; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 536; GFX6-NEXT: v_trunc_f32_e32 v2, v2 537; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 538; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 539; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 540; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 541; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 542; GFX6-NEXT: s_mov_b32 s2, -1 543; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 544; GFX6-NEXT: s_waitcnt lgkmcnt(0) 545; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 546; GFX6-NEXT: s_endpgm 547; 548; GFX9-LABEL: urem_i16: 549; GFX9: ; %bb.0: 550; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 551; GFX9-NEXT: s_waitcnt lgkmcnt(0) 552; GFX9-NEXT: s_lshr_b32 s3, s2, 16 553; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 554; GFX9-NEXT: s_and_b32 s0, s2, 0xffff 555; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 556; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 557; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 558; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 559; GFX9-NEXT: v_trunc_f32_e32 v2, v2 560; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 561; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 562; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 563; GFX9-NEXT: v_mov_b32_e32 v1, 0 564; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 565; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 566; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 567; GFX9-NEXT: s_waitcnt lgkmcnt(0) 568; GFX9-NEXT: global_store_short v1, v0, s[0:1] 569; GFX9-NEXT: s_endpgm 570 %r = urem i16 %x, %y 571 store i16 %r, ptr addrspace(1) %out 572 ret void 573} 574 575define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { 576; CHECK-LABEL: @sdiv_i16( 577; CHECK-NEXT: 
[[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 578; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 579; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 580; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 581; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 582; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 583; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 584; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 585; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 586; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 587; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 588; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 589; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 590; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 591; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 592; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 593; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 594; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 595; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 596; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 597; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 598; CHECK-NEXT: store i16 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 2 599; CHECK-NEXT: ret void 600; 601; GFX6-LABEL: sdiv_i16: 602; GFX6: ; %bb.0: 603; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 604; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 605; GFX6-NEXT: s_mov_b32 s3, 0xf000 606; GFX6-NEXT: s_mov_b32 s2, -1 607; GFX6-NEXT: s_waitcnt lgkmcnt(0) 608; GFX6-NEXT: s_ashr_i32 s4, s6, 16 609; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 610; GFX6-NEXT: s_sext_i32_i16 s5, s6 611; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 612; GFX6-NEXT: s_xor_b32 s4, s5, s4 613; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, 
v0 614; GFX6-NEXT: s_ashr_i32 s4, s4, 30 615; GFX6-NEXT: s_or_b32 s6, s4, 1 616; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 617; GFX6-NEXT: v_trunc_f32_e32 v2, v2 618; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 619; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 620; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 621; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 622; GFX6-NEXT: s_cselect_b32 s4, s6, 0 623; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 624; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 625; GFX6-NEXT: s_endpgm 626; 627; GFX9-LABEL: sdiv_i16: 628; GFX9: ; %bb.0: 629; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 630; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 631; GFX9-NEXT: v_mov_b32_e32 v1, 0 632; GFX9-NEXT: s_waitcnt lgkmcnt(0) 633; GFX9-NEXT: s_ashr_i32 s3, s2, 16 634; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 635; GFX9-NEXT: s_sext_i32_i16 s2, s2 636; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 637; GFX9-NEXT: s_xor_b32 s2, s2, s3 638; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 639; GFX9-NEXT: s_ashr_i32 s2, s2, 30 640; GFX9-NEXT: s_or_b32 s4, s2, 1 641; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 642; GFX9-NEXT: v_trunc_f32_e32 v3, v3 643; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 644; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 645; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| 646; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 647; GFX9-NEXT: s_cselect_b32 s2, s4, 0 648; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 649; GFX9-NEXT: global_store_short v1, v0, s[0:1] 650; GFX9-NEXT: s_endpgm 651 %r = sdiv i16 %x, %y 652 store i16 %r, ptr addrspace(1) %out 653 ret void 654} 655 656define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { 657; CHECK-LABEL: @srem_i16( 658; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 659; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 660; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 661; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 662; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 663; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 664; 
CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 665; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 666; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 667; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 668; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 669; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 670; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 671; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 672; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 673; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 674; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 675; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 676; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 677; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 678; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 679; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 680; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 681; CHECK-NEXT: store i16 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 2 682; CHECK-NEXT: ret void 683; 684; GFX6-LABEL: srem_i16: 685; GFX6: ; %bb.0: 686; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 687; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 688; GFX6-NEXT: s_waitcnt lgkmcnt(0) 689; GFX6-NEXT: s_ashr_i32 s7, s6, 16 690; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s7 691; GFX6-NEXT: s_sext_i32_i16 s2, s6 692; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 693; GFX6-NEXT: s_xor_b32 s2, s2, s7 694; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 695; GFX6-NEXT: s_ashr_i32 s2, s2, 30 696; GFX6-NEXT: s_or_b32 s4, s2, 1 697; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 698; GFX6-NEXT: v_trunc_f32_e32 v2, v2 699; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 700; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 701; GFX6-NEXT: v_cmp_ge_f32_e64 
s[2:3], |v1|, |v0| 702; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec 703; GFX6-NEXT: s_cselect_b32 s2, s4, 0 704; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 705; GFX6-NEXT: v_mul_lo_u32 v0, v0, s7 706; GFX6-NEXT: s_mov_b32 s3, 0xf000 707; GFX6-NEXT: s_mov_b32 s2, -1 708; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 709; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 710; GFX6-NEXT: s_endpgm 711; 712; GFX9-LABEL: srem_i16: 713; GFX9: ; %bb.0: 714; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 715; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 716; GFX9-NEXT: s_waitcnt lgkmcnt(0) 717; GFX9-NEXT: s_ashr_i32 s7, s6, 16 718; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s7 719; GFX9-NEXT: s_sext_i32_i16 s2, s6 720; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 721; GFX9-NEXT: s_xor_b32 s2, s2, s7 722; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 723; GFX9-NEXT: s_ashr_i32 s2, s2, 30 724; GFX9-NEXT: s_or_b32 s4, s2, 1 725; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 726; GFX9-NEXT: v_trunc_f32_e32 v2, v2 727; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 728; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 729; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 730; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 731; GFX9-NEXT: s_cselect_b32 s2, s4, 0 732; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 733; GFX9-NEXT: v_mul_lo_u32 v0, v0, s7 734; GFX9-NEXT: v_mov_b32_e32 v1, 0 735; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 736; GFX9-NEXT: global_store_short v1, v0, s[0:1] 737; GFX9-NEXT: s_endpgm 738 %r = srem i16 %x, %y 739 store i16 %r, ptr addrspace(1) %out 740 ret void 741} 742 743define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { 744; CHECK-LABEL: @udiv_i8( 745; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 746; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 747; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 748; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 749; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 750; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 
751; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 752; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 753; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 754; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 755; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 756; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 757; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 758; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 759; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 760; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 761; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 762; CHECK-NEXT: store i8 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1 763; CHECK-NEXT: ret void 764; 765; GFX6-LABEL: udiv_i8: 766; GFX6: ; %bb.0: 767; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 768; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 769; GFX6-NEXT: s_mov_b32 s3, 0xf000 770; GFX6-NEXT: s_mov_b32 s2, -1 771; GFX6-NEXT: s_waitcnt lgkmcnt(0) 772; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 773; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 774; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 775; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 776; GFX6-NEXT: v_trunc_f32_e32 v1, v1 777; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 778; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 779; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 780; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 781; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 782; GFX6-NEXT: s_endpgm 783; 784; GFX9-LABEL: udiv_i8: 785; GFX9: ; %bb.0: 786; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 787; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 788; GFX9-NEXT: v_mov_b32_e32 v2, 0 789; GFX9-NEXT: s_waitcnt lgkmcnt(0) 790; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 791; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 792; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 793; GFX9-NEXT: 
v_mul_f32_e32 v1, v3, v1 794; GFX9-NEXT: v_trunc_f32_e32 v1, v1 795; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 796; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 797; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 798; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 799; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 800; GFX9-NEXT: s_endpgm 801 %r = udiv i8 %x, %y 802 store i8 %r, ptr addrspace(1) %out 803 ret void 804} 805 ; @urem_i8 covers unsigned 8-bit remainder, its body is a single urem i8 %x, %y whose result is stored to %out. The opt checks expect the same zext, uitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32 and fmad.ftz based quotient with a select of 1 or 0 added to it, followed by a mul with the zero-extended divisor, a sub from the zero-extended dividend, an 'and 255' and a trunc to i8. The tahiti and gfx900 llc checks pin the corresponding instruction sequences. 806define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { 807; CHECK-LABEL: @urem_i8( 808; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 809; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 810; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 811; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 812; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 813; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 814; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 815; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 816; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 817; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 818; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 819; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 820; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 821; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 822; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 823; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 824; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 825; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 826; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 827; CHECK-NEXT: store i8 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1 828; CHECK-NEXT: ret void 829; 830; GFX6-LABEL: urem_i8: 831; GFX6: ; %bb.0: 832; GFX6-NEXT:
s_load_dword s6, s[4:5], 0xb 833; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 834; GFX6-NEXT: s_mov_b32 s3, 0xf000 835; GFX6-NEXT: s_waitcnt lgkmcnt(0) 836; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 837; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 838; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 839; GFX6-NEXT: s_lshr_b32 s2, s6, 8 840; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 841; GFX6-NEXT: v_trunc_f32_e32 v1, v1 842; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 843; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 844; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 845; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 846; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 847; GFX6-NEXT: s_mov_b32 s2, -1 848; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 849; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 850; GFX6-NEXT: s_endpgm 851; 852; GFX9-LABEL: urem_i8: 853; GFX9: ; %bb.0: 854; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 855; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 856; GFX9-NEXT: s_waitcnt lgkmcnt(0) 857; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 858; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 859; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 860; GFX9-NEXT: s_lshr_b32 s3, s2, 8 861; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 862; GFX9-NEXT: v_trunc_f32_e32 v1, v1 863; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 864; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 865; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 866; GFX9-NEXT: v_mov_b32_e32 v1, 0 867; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 868; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 869; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 870; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 871; GFX9-NEXT: s_endpgm 872 %r = urem i8 %x, %y 873 store i8 %r, ptr addrspace(1) %out 874 ret void 875} 876 ; @sdiv_i8 covers signed 8-bit division, its body is a single sdiv i8 %x, %y whose result is stored to %out. The opt checks expect both operands sign-extended to i32, a correction term computed as or(ashr(xor(x, y), 30), 1), a quotient estimate built from sitofp, llvm.amdgcn.rcp.f32, fmul and llvm.trunc.f32, a select between that correction term and 0 added to the fptosi quotient when the fabs of the llvm.amdgcn.fmad.ftz.f32 result is oge the fabs of the converted divisor, and a shl 24 / ashr 24 pair ahead of the trunc to i8. The tahiti and gfx900 llc checks pin the corresponding instruction sequences. 877define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { 878; CHECK-LABEL: @sdiv_i8( 879; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 880; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 881; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 882; CHECK-NEXT: [[TMP4:%.*]] = ashr i32
[[TMP3]], 30 883; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 884; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 885; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 886; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 887; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 888; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 889; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 890; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 891; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 892; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 893; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 894; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 895; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 896; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 897; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 898; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 899; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 900; CHECK-NEXT: store i8 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1 901; CHECK-NEXT: ret void 902; 903; GFX6-LABEL: sdiv_i8: 904; GFX6: ; %bb.0: 905; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 906; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 907; GFX6-NEXT: s_mov_b32 s3, 0xf000 908; GFX6-NEXT: s_mov_b32 s2, -1 909; GFX6-NEXT: s_waitcnt lgkmcnt(0) 910; GFX6-NEXT: s_bfe_i32 s4, s6, 0x80008 911; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 912; GFX6-NEXT: s_sext_i32_i8 s5, s6 913; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 914; GFX6-NEXT: s_xor_b32 s4, s5, s4 915; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 916; GFX6-NEXT: s_ashr_i32 s4, s4, 30 917; GFX6-NEXT: s_or_b32 s6, s4, 1 918; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 919; GFX6-NEXT: v_trunc_f32_e32 v2, v2 920; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 
921; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 922; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 923; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 924; GFX6-NEXT: s_cselect_b32 s4, s6, 0 925; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 926; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 927; GFX6-NEXT: s_endpgm 928; 929; GFX9-LABEL: sdiv_i8: 930; GFX9: ; %bb.0: 931; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 932; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 933; GFX9-NEXT: v_mov_b32_e32 v1, 0 934; GFX9-NEXT: s_waitcnt lgkmcnt(0) 935; GFX9-NEXT: s_bfe_i32 s3, s2, 0x80008 936; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 937; GFX9-NEXT: s_sext_i32_i8 s2, s2 938; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 939; GFX9-NEXT: s_xor_b32 s2, s2, s3 940; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 941; GFX9-NEXT: s_ashr_i32 s2, s2, 30 942; GFX9-NEXT: s_or_b32 s4, s2, 1 943; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 944; GFX9-NEXT: v_trunc_f32_e32 v3, v3 945; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 946; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 947; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| 948; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 949; GFX9-NEXT: s_cselect_b32 s2, s4, 0 950; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 951; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 952; GFX9-NEXT: s_endpgm 953 %r = sdiv i8 %x, %y 954 store i8 %r, ptr addrspace(1) %out 955 ret void 956} 957 ; @srem_i8 covers signed 8-bit remainder, its body is a single srem i8 %x, %y whose result is stored to %out. The opt checks expect the same sext, xor / ashr 30 / or 1 correction term, sitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32 and fmad.ftz based quotient as the signed division case, followed by a mul with the sign-extended divisor, a sub from the sign-extended dividend, a shl 24 / ashr 24 pair and a trunc to i8. The tahiti and gfx900 llc checks pin the corresponding instruction sequences. 958define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { 959; CHECK-LABEL: @srem_i8( 960; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 961; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 962; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 963; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 964; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 965; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 966; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 967; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 968; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
969; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 970; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 971; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 972; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 973; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 974; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 975; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 976; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 977; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 978; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 979; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 980; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 981; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 982; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 983; CHECK-NEXT: store i8 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1 984; CHECK-NEXT: ret void 985; 986; GFX6-LABEL: srem_i8: 987; GFX6: ; %bb.0: 988; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 989; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 990; GFX6-NEXT: s_waitcnt lgkmcnt(0) 991; GFX6-NEXT: s_bfe_i32 s2, s6, 0x80008 992; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 993; GFX6-NEXT: s_sext_i32_i8 s3, s6 994; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 995; GFX6-NEXT: s_xor_b32 s2, s3, s2 996; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 997; GFX6-NEXT: s_ashr_i32 s2, s2, 30 998; GFX6-NEXT: s_lshr_b32 s4, s6, 8 999; GFX6-NEXT: s_or_b32 s5, s2, 1 1000; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1001; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1002; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1003; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1004; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 1005; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec 1006; GFX6-NEXT: s_cselect_b32 s2, s5, 0 1007; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 1008; GFX6-NEXT: 
v_mul_lo_u32 v0, v0, s4 1009; GFX6-NEXT: s_mov_b32 s3, 0xf000 1010; GFX6-NEXT: s_mov_b32 s2, -1 1011; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 1012; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1013; GFX6-NEXT: s_endpgm 1014; 1015; GFX9-LABEL: srem_i8: 1016; GFX9: ; %bb.0: 1017; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c 1018; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1019; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1020; GFX9-NEXT: s_bfe_i32 s2, s6, 0x80008 1021; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 1022; GFX9-NEXT: s_sext_i32_i8 s3, s6 1023; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 1024; GFX9-NEXT: s_xor_b32 s2, s3, s2 1025; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1026; GFX9-NEXT: s_ashr_i32 s2, s2, 30 1027; GFX9-NEXT: s_lshr_b32 s4, s6, 8 1028; GFX9-NEXT: s_or_b32 s5, s2, 1 1029; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1030; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1031; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1032; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1033; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 1034; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 1035; GFX9-NEXT: s_cselect_b32 s2, s5, 0 1036; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 1037; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 1038; GFX9-NEXT: v_mov_b32_e32 v1, 0 1039; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 1040; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 1041; GFX9-NEXT: s_endpgm 1042 %r = srem i8 %x, %y 1043 store i8 %r, ptr addrspace(1) %out 1044 ret void 1045} 1046 ; @udiv_v4i32 covers unsigned division of <4 x i32> kernel arguments, its body is a single udiv <4 x i32> %x, %y whose result is stored to %out. The opt checks expect the operation to be expanded one lane at a time for indices 0 through 3, where the lane operands are read with extractelement, a reciprocal estimate is formed with uitofp, llvm.amdgcn.rcp.f32, an fmul by 0x41EFFFFFC0000000 and fptoui, the estimate is refined with i64 multiplies whose upper half is taken with lshr 32, the quotient receives two icmp uge / add 1 / select rounds, and each lane result is placed back with insertelement (starting from poison) before one <4 x i32> store with align 16. The tahiti and gfx900 llc checks pin the corresponding instruction sequences. 1047define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) { 1048; CHECK-LABEL: @udiv_v4i32( 1049; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1050; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1051; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1052; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1053; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1054; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1055; CHECK-NEXT:
[[TMP7:%.*]] = sub i32 0, [[TMP2]] 1056; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1057; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1058; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1059; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1060; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1061; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1062; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1063; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1064; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1065; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1066; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1067; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1068; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1069; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1070; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1071; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1072; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1073; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1074; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1075; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1076; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1077; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 1078; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1079; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1080; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> poison, i32 [[TMP31]], i64 0 1081; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1082; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1083; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1084; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1085; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float 
[[TMP36]], 0x41EFFFFFC0000000 1086; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1087; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 1088; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1089; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1090; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1091; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1092; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1093; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1094; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1095; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1096; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1097; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1098; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1099; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1100; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1101; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1102; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1103; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1104; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1105; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1106; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1107; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1108; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1109; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1110; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1111; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1112; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1113; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1114; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1115; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1116; 
CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1117; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 1118; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1119; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1120; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1121; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1122; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1123; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1124; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1125; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1126; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1127; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1128; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1129; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1130; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1131; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1132; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1133; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1134; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1135; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1136; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1137; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1138; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1139; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1140; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1141; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1142; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1143; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1144; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1145; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1146; CHECK-NEXT: 
[[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1147; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1148; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1149; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1150; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1151; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1152; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1153; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1154; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1155; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1156; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1157; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1158; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1159; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1160; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1161; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1162; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1163; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1164; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1165; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1166; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1167; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 1168; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1169; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1170; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1171; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1172; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1173; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1174; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1175; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 
[[TMP122]] 1176; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1177; CHECK-NEXT: store <4 x i32> [[TMP128]], ptr addrspace(1) [[OUT:%.*]], align 16 1178; CHECK-NEXT: ret void 1179; 1180; GFX6-LABEL: udiv_v4i32: 1181; GFX6: ; %bb.0: 1182; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 1183; GFX6-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9 1184; GFX6-NEXT: s_mov_b32 s19, 0xf000 1185; GFX6-NEXT: s_mov_b32 s18, -1 1186; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 1188; GFX6-NEXT: s_sub_i32 s0, 0, s12 1189; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 1190; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 1191; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1192; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s15 1193; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 1194; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 1195; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1196; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1197; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 1198; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 1199; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 1200; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1201; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 1202; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 1203; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1204; GFX6-NEXT: v_readfirstlane_b32 s0, v0 1205; GFX6-NEXT: s_mul_i32 s0, s0, s12 1206; GFX6-NEXT: s_sub_i32 s0, s8, s0 1207; GFX6-NEXT: s_sub_i32 s1, s0, s12 1208; GFX6-NEXT: s_cmp_ge_u32 s0, s12 1209; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 1210; GFX6-NEXT: s_cselect_b32 s0, s1, s0 1211; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1212; GFX6-NEXT: s_cmp_ge_u32 s0, s12 1213; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 1214; GFX6-NEXT: s_sub_i32 s2, 0, s13 1215; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1216; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 1217; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 1218; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1219; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 1220; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1221; GFX6-NEXT: v_mul_hi_u32 v1, s9, 
v1 1222; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 1223; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1224; GFX6-NEXT: v_readfirstlane_b32 s2, v1 1225; GFX6-NEXT: s_mul_i32 s2, s2, s13 1226; GFX6-NEXT: s_sub_i32 s2, s9, s2 1227; GFX6-NEXT: s_sub_i32 s3, s2, s13 1228; GFX6-NEXT: s_cmp_ge_u32 s2, s13 1229; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1230; GFX6-NEXT: s_cselect_b32 s2, s3, s2 1231; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1232; GFX6-NEXT: s_cmp_ge_u32 s2, s13 1233; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 1234; GFX6-NEXT: s_sub_i32 s6, 0, s14 1235; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 1236; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1237; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1238; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 1239; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 1240; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 1241; GFX6-NEXT: v_mul_hi_u32 v3, s10, v3 1242; GFX6-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v6 1243; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 1244; GFX6-NEXT: v_readfirstlane_b32 s6, v3 1245; GFX6-NEXT: s_mul_i32 s6, s6, s14 1246; GFX6-NEXT: s_sub_i32 s6, s10, s6 1247; GFX6-NEXT: s_sub_i32 s7, s6, s14 1248; GFX6-NEXT: s_cmp_ge_u32 s6, s14 1249; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3 1250; GFX6-NEXT: s_cselect_b32 s6, s7, s6 1251; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1252; GFX6-NEXT: s_cmp_ge_u32 s6, s14 1253; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 1254; GFX6-NEXT: s_sub_i32 s8, 0, s15 1255; GFX6-NEXT: v_mul_lo_u32 v7, s8, v5 1256; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1257; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v3 1258; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 1259; GFX6-NEXT: v_cndmask_b32_e64 v2, v3, v6, s[6:7] 1260; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 1261; GFX6-NEXT: v_mul_hi_u32 v5, s11, v5 1262; GFX6-NEXT: v_readfirstlane_b32 s0, v5 1263; GFX6-NEXT: s_mul_i32 s0, s0, s15 1264; GFX6-NEXT: s_sub_i32 s0, s11, s0 1265; GFX6-NEXT: s_sub_i32 s1, s0, s15 1266; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v5 1267; GFX6-NEXT: s_cmp_ge_u32 s0, s15 1268; GFX6-NEXT: 
s_cselect_b64 vcc, -1, 0 1269; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 1270; GFX6-NEXT: s_cselect_b32 s0, s1, s0 1271; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 1272; GFX6-NEXT: s_cmp_ge_u32 s0, s15 1273; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1274; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1275; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 1276; GFX6-NEXT: s_endpgm 1277; 1278; GFX9-LABEL: udiv_v4i32: 1279; GFX9: ; %bb.0: 1280; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1281; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1282; GFX9-NEXT: v_mov_b32_e32 v4, 0 1283; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1284; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 1285; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 1286; GFX9-NEXT: s_sub_i32 s2, 0, s12 1287; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 1288; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1289; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1290; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1291; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1292; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1293; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1294; GFX9-NEXT: v_readfirstlane_b32 s3, v0 1295; GFX9-NEXT: s_mul_i32 s2, s2, s3 1296; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 1297; GFX9-NEXT: s_add_i32 s3, s3, s2 1298; GFX9-NEXT: s_mul_hi_u32 s2, s8, s3 1299; GFX9-NEXT: s_mul_i32 s3, s2, s12 1300; GFX9-NEXT: s_sub_i32 s3, s8, s3 1301; GFX9-NEXT: s_add_i32 s5, s2, 1 1302; GFX9-NEXT: s_sub_i32 s6, s3, s12 1303; GFX9-NEXT: s_cmp_ge_u32 s3, s12 1304; GFX9-NEXT: s_cselect_b32 s2, s5, s2 1305; GFX9-NEXT: s_cselect_b32 s3, s6, s3 1306; GFX9-NEXT: s_add_i32 s5, s2, 1 1307; GFX9-NEXT: s_cmp_ge_u32 s3, s12 1308; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1309; GFX9-NEXT: s_cselect_b32 s2, s5, s2 1310; GFX9-NEXT: s_sub_i32 s3, 0, s13 1311; GFX9-NEXT: s_mul_i32 s3, s3, s4 1312; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 1313; GFX9-NEXT: s_add_i32 s4, s4, s3 1314; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2 1315; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4 1316; GFX9-NEXT: s_mul_i32 s4, s3, s13 1317; GFX9-NEXT: 
s_sub_i32 s4, s9, s4 1318; GFX9-NEXT: s_add_i32 s5, s3, 1 1319; GFX9-NEXT: s_sub_i32 s6, s4, s13 1320; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1321; GFX9-NEXT: s_cmp_ge_u32 s4, s13 1322; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1323; GFX9-NEXT: s_cselect_b32 s3, s5, s3 1324; GFX9-NEXT: s_cselect_b32 s4, s6, s4 1325; GFX9-NEXT: s_add_i32 s5, s3, 1 1326; GFX9-NEXT: s_cmp_ge_u32 s4, s13 1327; GFX9-NEXT: s_cselect_b32 s3, s5, s3 1328; GFX9-NEXT: v_readfirstlane_b32 s5, v0 1329; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 1330; GFX9-NEXT: s_sub_i32 s4, 0, s14 1331; GFX9-NEXT: s_mul_i32 s4, s4, s5 1332; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 1333; GFX9-NEXT: s_add_i32 s5, s5, s4 1334; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1335; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 1336; GFX9-NEXT: s_mul_i32 s5, s4, s14 1337; GFX9-NEXT: s_sub_i32 s5, s10, s5 1338; GFX9-NEXT: s_add_i32 s6, s4, 1 1339; GFX9-NEXT: s_sub_i32 s7, s5, s14 1340; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1341; GFX9-NEXT: s_cmp_ge_u32 s5, s14 1342; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1343; GFX9-NEXT: s_cselect_b32 s4, s6, s4 1344; GFX9-NEXT: s_cselect_b32 s5, s7, s5 1345; GFX9-NEXT: s_add_i32 s6, s4, 1 1346; GFX9-NEXT: s_cmp_ge_u32 s5, s14 1347; GFX9-NEXT: s_cselect_b32 s4, s6, s4 1348; GFX9-NEXT: s_sub_i32 s5, 0, s15 1349; GFX9-NEXT: v_readfirstlane_b32 s6, v0 1350; GFX9-NEXT: s_mul_i32 s5, s5, s6 1351; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 1352; GFX9-NEXT: s_add_i32 s6, s6, s5 1353; GFX9-NEXT: s_mul_hi_u32 s5, s11, s6 1354; GFX9-NEXT: s_mul_i32 s6, s5, s15 1355; GFX9-NEXT: s_sub_i32 s6, s11, s6 1356; GFX9-NEXT: s_add_i32 s7, s5, 1 1357; GFX9-NEXT: s_sub_i32 s8, s6, s15 1358; GFX9-NEXT: s_cmp_ge_u32 s6, s15 1359; GFX9-NEXT: s_cselect_b32 s5, s7, s5 1360; GFX9-NEXT: s_cselect_b32 s6, s8, s6 1361; GFX9-NEXT: s_add_i32 s7, s5, 1 1362; GFX9-NEXT: s_cmp_ge_u32 s6, s15 1363; GFX9-NEXT: s_cselect_b32 s5, s7, s5 1364; GFX9-NEXT: v_mov_b32_e32 v0, s2 1365; GFX9-NEXT: v_mov_b32_e32 v1, s3 1366; GFX9-NEXT: v_mov_b32_e32 v2, s4 1367; 
GFX9-NEXT: v_mov_b32_e32 v3, s5 1368; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1369; GFX9-NEXT: s_endpgm 1370 %r = udiv <4 x i32> %x, %y 1371 store <4 x i32> %r, ptr addrspace(1) %out 1372 ret void 1373} 1374 1375define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) { 1376; CHECK-LABEL: @urem_v4i32( 1377; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1378; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1379; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1380; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1381; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1382; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1383; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1384; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1385; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1386; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1387; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1388; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1389; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1390; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1391; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1392; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1393; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1394; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1395; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1396; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1397; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1398; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1399; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1400; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1401; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1402; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 
[[TMP25]], i32 [[TMP23]] 1403; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1404; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1405; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1406; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i64 0 1407; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1408; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1409; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1410; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 1411; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1412; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1413; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1414; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1415; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1416; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1417; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1418; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1419; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1420; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1421; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1422; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1423; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1424; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1425; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1426; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1427; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1428; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1429; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1430; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1431; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1432; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], 
i32 [[TMP53]] 1433; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1434; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1435; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1436; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1437; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1438; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1439; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1440; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1441; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1442; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1443; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1444; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1445; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1446; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1447; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1448; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1449; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1450; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1451; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1452; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1453; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1454; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1455; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1456; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1457; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1458; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1459; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1460; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1461; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1462; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 
[[TMP83]] 1463; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1464; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1465; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1466; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1467; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1468; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1469; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1470; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1471; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1472; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1473; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1474; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1475; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1476; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1477; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1478; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1479; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1480; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1481; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1482; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1483; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1484; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1485; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1486; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1487; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1488; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1489; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1490; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1491; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1492; CHECK-NEXT: [[TMP116:%.*]] = select i1 
[[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1493; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1494; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1495; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1496; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1497; CHECK-NEXT: store <4 x i32> [[TMP120]], ptr addrspace(1) [[OUT:%.*]], align 16 1498; CHECK-NEXT: ret void 1499; 1500; GFX6-LABEL: urem_v4i32: 1501; GFX6: ; %bb.0: 1502; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 1503; GFX6-NEXT: s_mov_b32 s3, 0xf000 1504; GFX6-NEXT: s_mov_b32 s2, -1 1505; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1506; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 1507; GFX6-NEXT: s_sub_i32 s0, 0, s12 1508; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 1509; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1510; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 1511; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1512; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1513; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 1514; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 1515; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1516; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 1517; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 1518; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1519; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s14 1520; GFX6-NEXT: v_readfirstlane_b32 s0, v0 1521; GFX6-NEXT: s_mul_i32 s0, s0, s12 1522; GFX6-NEXT: s_sub_i32 s0, s8, s0 1523; GFX6-NEXT: s_sub_i32 s1, s0, s12 1524; GFX6-NEXT: s_cmp_ge_u32 s0, s12 1525; GFX6-NEXT: s_cselect_b32 s0, s1, s0 1526; GFX6-NEXT: s_sub_i32 s1, s0, s12 1527; GFX6-NEXT: s_cmp_ge_u32 s0, s12 1528; GFX6-NEXT: s_cselect_b32 s6, s1, s0 1529; GFX6-NEXT: s_sub_i32 s0, 0, s13 1530; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 1531; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 1532; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 1533; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1534; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 1535; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 1536; GFX6-NEXT: v_cvt_u32_f32_e32 v1, 
v1 1537; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s15 1538; GFX6-NEXT: v_readfirstlane_b32 s0, v0 1539; GFX6-NEXT: s_mul_i32 s0, s0, s13 1540; GFX6-NEXT: s_sub_i32 s0, s9, s0 1541; GFX6-NEXT: s_sub_i32 s1, s0, s13 1542; GFX6-NEXT: s_cmp_ge_u32 s0, s13 1543; GFX6-NEXT: s_cselect_b32 s0, s1, s0 1544; GFX6-NEXT: s_sub_i32 s1, s0, s13 1545; GFX6-NEXT: s_cmp_ge_u32 s0, s13 1546; GFX6-NEXT: s_cselect_b32 s7, s1, s0 1547; GFX6-NEXT: s_sub_i32 s0, 0, s14 1548; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 1549; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 1550; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 1551; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1552; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 1553; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 1554; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1555; GFX6-NEXT: v_readfirstlane_b32 s0, v0 1556; GFX6-NEXT: s_mul_i32 s0, s0, s14 1557; GFX6-NEXT: s_sub_i32 s0, s10, s0 1558; GFX6-NEXT: s_sub_i32 s1, s0, s14 1559; GFX6-NEXT: s_cmp_ge_u32 s0, s14 1560; GFX6-NEXT: s_cselect_b32 s0, s1, s0 1561; GFX6-NEXT: s_sub_i32 s1, s0, s14 1562; GFX6-NEXT: s_cmp_ge_u32 s0, s14 1563; GFX6-NEXT: s_cselect_b32 s8, s1, s0 1564; GFX6-NEXT: s_sub_i32 s0, 0, s15 1565; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 1566; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1567; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 1568; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1569; GFX6-NEXT: v_mul_hi_u32 v2, s11, v0 1570; GFX6-NEXT: v_mov_b32_e32 v0, s6 1571; GFX6-NEXT: v_mov_b32_e32 v1, s7 1572; GFX6-NEXT: v_readfirstlane_b32 s4, v2 1573; GFX6-NEXT: s_mul_i32 s4, s4, s15 1574; GFX6-NEXT: s_sub_i32 s4, s11, s4 1575; GFX6-NEXT: s_sub_i32 s5, s4, s15 1576; GFX6-NEXT: s_cmp_ge_u32 s4, s15 1577; GFX6-NEXT: s_cselect_b32 s4, s5, s4 1578; GFX6-NEXT: s_sub_i32 s5, s4, s15 1579; GFX6-NEXT: s_cmp_ge_u32 s4, s15 1580; GFX6-NEXT: s_cselect_b32 s4, s5, s4 1581; GFX6-NEXT: v_mov_b32_e32 v2, s8 1582; GFX6-NEXT: v_mov_b32_e32 v3, s4 1583; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1584; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1585; GFX6-NEXT: 
s_endpgm 1586; 1587; GFX9-LABEL: urem_v4i32: 1588; GFX9: ; %bb.0: 1589; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1590; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1591; GFX9-NEXT: v_mov_b32_e32 v4, 0 1592; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1593; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 1594; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 1595; GFX9-NEXT: s_sub_i32 s2, 0, s12 1596; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 1597; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1598; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1599; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1600; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1601; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1602; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1603; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1604; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 1605; GFX9-NEXT: v_readfirstlane_b32 s3, v0 1606; GFX9-NEXT: s_mul_i32 s2, s2, s3 1607; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 1608; GFX9-NEXT: s_add_i32 s3, s3, s2 1609; GFX9-NEXT: s_mul_hi_u32 s2, s8, s3 1610; GFX9-NEXT: s_mul_i32 s2, s2, s12 1611; GFX9-NEXT: s_sub_i32 s2, s8, s2 1612; GFX9-NEXT: s_sub_i32 s3, s2, s12 1613; GFX9-NEXT: s_cmp_ge_u32 s2, s12 1614; GFX9-NEXT: s_cselect_b32 s2, s3, s2 1615; GFX9-NEXT: s_sub_i32 s3, s2, s12 1616; GFX9-NEXT: s_cmp_ge_u32 s2, s12 1617; GFX9-NEXT: v_readfirstlane_b32 s4, v1 1618; GFX9-NEXT: s_cselect_b32 s2, s3, s2 1619; GFX9-NEXT: s_sub_i32 s3, 0, s13 1620; GFX9-NEXT: s_mul_i32 s3, s3, s4 1621; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 1622; GFX9-NEXT: s_add_i32 s4, s4, s3 1623; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4 1624; GFX9-NEXT: s_mul_i32 s3, s3, s13 1625; GFX9-NEXT: s_sub_i32 s3, s9, s3 1626; GFX9-NEXT: s_sub_i32 s4, s3, s13 1627; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1628; GFX9-NEXT: s_cmp_ge_u32 s3, s13 1629; GFX9-NEXT: s_cselect_b32 s3, s4, s3 1630; GFX9-NEXT: s_sub_i32 s4, s3, s13 1631; GFX9-NEXT: s_cmp_ge_u32 s3, s13 1632; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 1633; GFX9-NEXT: s_cselect_b32 s3, s4, s3 1634; GFX9-NEXT: s_sub_i32 s4, 0, s14 1635; GFX9-NEXT: 
v_readfirstlane_b32 s5, v2 1636; GFX9-NEXT: s_mul_i32 s4, s4, s5 1637; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 1638; GFX9-NEXT: s_add_i32 s5, s5, s4 1639; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1640; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 1641; GFX9-NEXT: s_mul_i32 s4, s4, s14 1642; GFX9-NEXT: s_sub_i32 s4, s10, s4 1643; GFX9-NEXT: s_sub_i32 s5, s4, s14 1644; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1645; GFX9-NEXT: s_cmp_ge_u32 s4, s14 1646; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1647; GFX9-NEXT: s_cselect_b32 s4, s5, s4 1648; GFX9-NEXT: s_sub_i32 s5, s4, s14 1649; GFX9-NEXT: s_cmp_ge_u32 s4, s14 1650; GFX9-NEXT: s_cselect_b32 s4, s5, s4 1651; GFX9-NEXT: s_sub_i32 s5, 0, s15 1652; GFX9-NEXT: v_readfirstlane_b32 s6, v0 1653; GFX9-NEXT: s_mul_i32 s5, s5, s6 1654; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 1655; GFX9-NEXT: s_add_i32 s6, s6, s5 1656; GFX9-NEXT: s_mul_hi_u32 s5, s11, s6 1657; GFX9-NEXT: s_mul_i32 s5, s5, s15 1658; GFX9-NEXT: s_sub_i32 s5, s11, s5 1659; GFX9-NEXT: s_sub_i32 s6, s5, s15 1660; GFX9-NEXT: s_cmp_ge_u32 s5, s15 1661; GFX9-NEXT: s_cselect_b32 s5, s6, s5 1662; GFX9-NEXT: s_sub_i32 s6, s5, s15 1663; GFX9-NEXT: s_cmp_ge_u32 s5, s15 1664; GFX9-NEXT: s_cselect_b32 s5, s6, s5 1665; GFX9-NEXT: v_mov_b32_e32 v0, s2 1666; GFX9-NEXT: v_mov_b32_e32 v1, s3 1667; GFX9-NEXT: v_mov_b32_e32 v2, s4 1668; GFX9-NEXT: v_mov_b32_e32 v3, s5 1669; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1670; GFX9-NEXT: s_endpgm 1671 %r = urem <4 x i32> %x, %y 1672 store <4 x i32> %r, ptr addrspace(1) %out 1673 ret void 1674} 1675 1676define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) { 1677; CHECK-LABEL: @sdiv_v4i32( 1678; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1679; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1680; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1681; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1682; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1683; CHECK-NEXT: 
[[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1684; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1685; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1686; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1687; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1688; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1689; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1690; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1691; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1692; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1693; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1694; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1695; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1696; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1697; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1698; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1699; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1700; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1701; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1702; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1703; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1704; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1705; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1706; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1707; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1708; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1709; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1710; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1711; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1712; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 1713; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1714; CHECK-NEXT: [[TMP37:%.*]] = 
add i32 [[TMP33]], 1 1715; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 1716; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1717; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1718; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP40]], i64 0 1719; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1720; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1721; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1722; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1723; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1724; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1725; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1726; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1727; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1728; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1729; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1730; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1731; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1732; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1733; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1734; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1735; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1736; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1737; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1738; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1739; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1740; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1741; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1742; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1743; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1744; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1745; CHECK-NEXT: 
[[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1746; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 1747; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1748; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1749; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1750; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1751; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1752; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1753; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1754; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1755; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1756; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1757; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1758; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1759; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1760; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1761; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1762; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1763; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1764; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1765; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1766; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1767; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1768; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1769; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1770; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1771; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1772; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 1773; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1774; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1775; 
CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1776; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1777; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1778; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1779; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1780; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1781; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1782; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1783; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1784; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1785; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1786; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1787; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1788; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1789; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1790; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1791; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1792; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1793; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1794; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1795; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1796; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1797; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1798; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1799; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1800; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1801; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1802; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1803; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1804; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 
31 1805; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1806; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1807; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1808; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1809; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1810; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1811; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1812; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1813; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1814; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1815; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1816; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1817; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1818; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1819; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1820; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1821; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1822; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1823; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1824; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1825; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1826; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1827; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1828; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1829; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1830; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1831; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1832; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1833; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1834; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1835; 
CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1836; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1837; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1838; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1839; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1840; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1841; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1842; CHECK-NEXT: store <4 x i32> [[TMP164]], ptr addrspace(1) [[OUT:%.*]], align 16 1843; CHECK-NEXT: ret void 1844; 1845; GFX6-LABEL: sdiv_v4i32: 1846; GFX6: ; %bb.0: 1847; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 1848; GFX6-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9 1849; GFX6-NEXT: s_mov_b32 s19, 0xf000 1850; GFX6-NEXT: s_mov_b32 s18, -1 1851; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1852; GFX6-NEXT: s_abs_i32 s0, s12 1853; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 1854; GFX6-NEXT: s_sub_i32 s1, 0, s0 1855; GFX6-NEXT: s_xor_b32 s2, s8, s12 1856; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1857; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1858; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1859; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 1860; GFX6-NEXT: s_abs_i32 s1, s8 1861; GFX6-NEXT: s_ashr_i32 s8, s2, 31 1862; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 1863; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1864; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 1865; GFX6-NEXT: v_readfirstlane_b32 s2, v0 1866; GFX6-NEXT: s_mul_i32 s2, s2, s0 1867; GFX6-NEXT: s_sub_i32 s1, s1, s2 1868; GFX6-NEXT: s_sub_i32 s2, s1, s0 1869; GFX6-NEXT: s_cmp_ge_u32 s1, s0 1870; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 1871; GFX6-NEXT: s_cselect_b32 s1, s2, s1 1872; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1873; GFX6-NEXT: s_cmp_ge_u32 s1, s0 1874; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 1875; GFX6-NEXT: s_abs_i32 s2, s13 1876; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 1877; GFX6-NEXT: s_sub_i32 s3, 0, s2 1878; GFX6-NEXT: v_cndmask_b32_e32 
v0, v0, v1, vcc 1879; GFX6-NEXT: s_xor_b32 s6, s9, s13 1880; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 1881; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 1882; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] 1883; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 1884; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1885; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 1886; GFX6-NEXT: v_mul_lo_u32 v3, s3, v2 1887; GFX6-NEXT: s_abs_i32 s3, s9 1888; GFX6-NEXT: s_ashr_i32 s9, s6, 31 1889; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1890; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1891; GFX6-NEXT: v_mul_hi_u32 v2, s3, v2 1892; GFX6-NEXT: v_readfirstlane_b32 s6, v2 1893; GFX6-NEXT: s_mul_i32 s6, s6, s2 1894; GFX6-NEXT: s_sub_i32 s3, s3, s6 1895; GFX6-NEXT: s_sub_i32 s6, s3, s2 1896; GFX6-NEXT: s_cmp_ge_u32 s3, s2 1897; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 1898; GFX6-NEXT: s_cselect_b32 s3, s6, s3 1899; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1900; GFX6-NEXT: s_cmp_ge_u32 s3, s2 1901; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 1902; GFX6-NEXT: s_abs_i32 s6, s14 1903; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 1904; GFX6-NEXT: s_sub_i32 s7, 0, s6 1905; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 1906; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 1907; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 1908; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 1909; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1910; GFX6-NEXT: v_mul_lo_u32 v5, s7, v4 1911; GFX6-NEXT: s_abs_i32 s7, s10 1912; GFX6-NEXT: s_xor_b32 s10, s10, s14 1913; GFX6-NEXT: s_ashr_i32 s10, s10, 31 1914; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1915; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 1916; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1917; GFX6-NEXT: v_readfirstlane_b32 s12, v4 1918; GFX6-NEXT: s_mul_i32 s12, s12, s6 1919; GFX6-NEXT: s_sub_i32 s7, s7, s12 1920; GFX6-NEXT: s_sub_i32 s12, s7, s6 1921; GFX6-NEXT: s_cmp_ge_u32 s7, s6 1922; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1923; GFX6-NEXT: s_cselect_b32 s7, s12, s7 1924; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1925; GFX6-NEXT: s_cmp_ge_u32 s7, s6 1926; 
GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 1927; GFX6-NEXT: s_abs_i32 s12, s15 1928; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 1929; GFX6-NEXT: s_sub_i32 s0, 0, s12 1930; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 1931; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1932; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v6 1933; GFX6-NEXT: s_abs_i32 s1, s11 1934; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 1935; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1936; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v1 1937; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[2:3] 1938; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v5, s[6:7] 1939; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 1940; GFX6-NEXT: v_mul_lo_u32 v2, s0, v6 1941; GFX6-NEXT: s_xor_b32 s0, s11, s15 1942; GFX6-NEXT: v_xor_b32_e32 v3, s10, v3 1943; GFX6-NEXT: s_ashr_i32 s0, s0, 31 1944; GFX6-NEXT: v_mul_hi_u32 v2, v6, v2 1945; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 1946; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 1947; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 1948; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v3 1949; GFX6-NEXT: v_readfirstlane_b32 s2, v4 1950; GFX6-NEXT: s_mul_i32 s2, s2, s12 1951; GFX6-NEXT: s_sub_i32 s1, s1, s2 1952; GFX6-NEXT: s_sub_i32 s2, s1, s12 1953; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v4 1954; GFX6-NEXT: s_cmp_ge_u32 s1, s12 1955; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1956; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 1957; GFX6-NEXT: s_cselect_b32 s1, s2, s1 1958; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v3 1959; GFX6-NEXT: s_cmp_ge_u32 s1, s12 1960; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 1961; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1962; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 1963; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 1964; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 1965; GFX6-NEXT: s_endpgm 1966; 1967; GFX9-LABEL: sdiv_v4i32: 1968; GFX9: ; %bb.0: 1969; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 1970; GFX9-NEXT: v_mov_b32_e32 v4, 0 1971; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1972; GFX9-NEXT: s_abs_i32 s0, s12 1973; GFX9-NEXT: 
v_cvt_f32_u32_e32 v0, s0 1974; GFX9-NEXT: s_sub_i32 s3, 0, s0 1975; GFX9-NEXT: s_abs_i32 s2, s8 1976; GFX9-NEXT: s_xor_b32 s1, s8, s12 1977; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1978; GFX9-NEXT: s_ashr_i32 s1, s1, 31 1979; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1980; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1981; GFX9-NEXT: v_readfirstlane_b32 s6, v0 1982; GFX9-NEXT: s_mul_i32 s3, s3, s6 1983; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 1984; GFX9-NEXT: s_add_i32 s6, s6, s3 1985; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 1986; GFX9-NEXT: s_mul_i32 s6, s3, s0 1987; GFX9-NEXT: s_sub_i32 s2, s2, s6 1988; GFX9-NEXT: s_add_i32 s7, s3, 1 1989; GFX9-NEXT: s_sub_i32 s6, s2, s0 1990; GFX9-NEXT: s_cmp_ge_u32 s2, s0 1991; GFX9-NEXT: s_cselect_b32 s3, s7, s3 1992; GFX9-NEXT: s_cselect_b32 s2, s6, s2 1993; GFX9-NEXT: s_add_i32 s6, s3, 1 1994; GFX9-NEXT: s_cmp_ge_u32 s2, s0 1995; GFX9-NEXT: s_cselect_b32 s0, s6, s3 1996; GFX9-NEXT: s_abs_i32 s2, s13 1997; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 1998; GFX9-NEXT: s_xor_b32 s0, s0, s1 1999; GFX9-NEXT: s_sub_i32 s7, 0, s2 2000; GFX9-NEXT: s_sub_i32 s8, s0, s1 2001; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2002; GFX9-NEXT: s_abs_i32 s6, s9 2003; GFX9-NEXT: s_xor_b32 s3, s9, s13 2004; GFX9-NEXT: s_ashr_i32 s3, s3, 31 2005; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2006; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2007; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2008; GFX9-NEXT: s_mul_i32 s7, s7, s0 2009; GFX9-NEXT: s_mul_hi_u32 s1, s0, s7 2010; GFX9-NEXT: s_add_i32 s0, s0, s1 2011; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 2012; GFX9-NEXT: s_mul_i32 s1, s0, s2 2013; GFX9-NEXT: s_sub_i32 s1, s6, s1 2014; GFX9-NEXT: s_add_i32 s7, s0, 1 2015; GFX9-NEXT: s_sub_i32 s6, s1, s2 2016; GFX9-NEXT: s_cmp_ge_u32 s1, s2 2017; GFX9-NEXT: s_cselect_b32 s0, s7, s0 2018; GFX9-NEXT: s_cselect_b32 s1, s6, s1 2019; GFX9-NEXT: s_add_i32 s6, s0, 1 2020; GFX9-NEXT: s_cmp_ge_u32 s1, s2 2021; GFX9-NEXT: s_cselect_b32 s0, s6, s0 2022; GFX9-NEXT: s_abs_i32 s1, s14 2023; GFX9-NEXT: v_cvt_f32_u32_e32 
v0, s1 2024; GFX9-NEXT: s_xor_b32 s0, s0, s3 2025; GFX9-NEXT: s_sub_i32 s7, 0, s1 2026; GFX9-NEXT: s_sub_i32 s3, s0, s3 2027; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2028; GFX9-NEXT: s_abs_i32 s6, s10 2029; GFX9-NEXT: s_xor_b32 s2, s10, s14 2030; GFX9-NEXT: s_ashr_i32 s2, s2, 31 2031; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2032; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2033; GFX9-NEXT: v_mov_b32_e32 v1, s3 2034; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2035; GFX9-NEXT: s_mul_i32 s7, s7, s0 2036; GFX9-NEXT: s_mul_hi_u32 s7, s0, s7 2037; GFX9-NEXT: s_add_i32 s0, s0, s7 2038; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 2039; GFX9-NEXT: s_mul_i32 s7, s0, s1 2040; GFX9-NEXT: s_sub_i32 s6, s6, s7 2041; GFX9-NEXT: s_add_i32 s9, s0, 1 2042; GFX9-NEXT: s_sub_i32 s7, s6, s1 2043; GFX9-NEXT: s_cmp_ge_u32 s6, s1 2044; GFX9-NEXT: s_cselect_b32 s0, s9, s0 2045; GFX9-NEXT: s_cselect_b32 s6, s7, s6 2046; GFX9-NEXT: s_add_i32 s7, s0, 1 2047; GFX9-NEXT: s_cmp_ge_u32 s6, s1 2048; GFX9-NEXT: s_cselect_b32 s6, s7, s0 2049; GFX9-NEXT: s_abs_i32 s7, s15 2050; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7 2051; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2052; GFX9-NEXT: s_xor_b32 s5, s6, s2 2053; GFX9-NEXT: s_sub_i32 s6, 0, s7 2054; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2055; GFX9-NEXT: s_sub_i32 s2, s5, s2 2056; GFX9-NEXT: s_abs_i32 s4, s11 2057; GFX9-NEXT: s_xor_b32 s3, s11, s15 2058; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2059; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2060; GFX9-NEXT: v_mov_b32_e32 v0, s8 2061; GFX9-NEXT: s_ashr_i32 s3, s3, 31 2062; GFX9-NEXT: v_readfirstlane_b32 s5, v2 2063; GFX9-NEXT: s_mul_i32 s6, s6, s5 2064; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 2065; GFX9-NEXT: s_add_i32 s5, s5, s6 2066; GFX9-NEXT: s_mul_hi_u32 s5, s4, s5 2067; GFX9-NEXT: s_mul_i32 s6, s5, s7 2068; GFX9-NEXT: s_sub_i32 s4, s4, s6 2069; GFX9-NEXT: s_add_i32 s8, s5, 1 2070; GFX9-NEXT: s_sub_i32 s6, s4, s7 2071; GFX9-NEXT: s_cmp_ge_u32 s4, s7 2072; GFX9-NEXT: s_cselect_b32 s5, s8, s5 2073; GFX9-NEXT: s_cselect_b32 s4, s6, 
s4 2074; GFX9-NEXT: s_add_i32 s6, s5, 1 2075; GFX9-NEXT: s_cmp_ge_u32 s4, s7 2076; GFX9-NEXT: s_cselect_b32 s4, s6, s5 2077; GFX9-NEXT: s_xor_b32 s4, s4, s3 2078; GFX9-NEXT: s_sub_i32 s3, s4, s3 2079; GFX9-NEXT: v_mov_b32_e32 v2, s2 2080; GFX9-NEXT: v_mov_b32_e32 v3, s3 2081; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2082; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2083; GFX9-NEXT: s_endpgm 2084 %r = sdiv <4 x i32> %x, %y 2085 store <4 x i32> %r, ptr addrspace(1) %out 2086 ret void 2087} 2088 2089define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x i32> %y) { 2090; CHECK-LABEL: @srem_v4i32( 2091; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2092; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2093; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2094; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2095; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2096; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2097; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2098; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2099; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2100; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2101; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2102; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2103; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2104; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2105; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2106; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2107; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2108; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2109; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2110; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2111; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2112; CHECK-NEXT: [[TMP22:%.*]] = zext 
i32 [[TMP7]] to i64 2113; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2114; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2115; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2116; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2117; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2118; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2119; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2120; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2121; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2122; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2123; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2124; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2125; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2126; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2127; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2128; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i64 0 2129; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 2130; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2131; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2132; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2133; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2134; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2135; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2136; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2137; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2138; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2139; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2140; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2141; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2142; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], 
[[TMP50]] 2143; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2144; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2145; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2146; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2147; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2148; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2149; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2150; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2151; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2152; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2153; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2154; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2155; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2156; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2157; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2158; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2159; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2160; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 2161; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2162; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2163; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2164; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2165; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2166; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2167; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2168; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2169; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2170; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2171; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2172; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2173; CHECK-NEXT: [[TMP83:%.*]] = xor i32 
[[TMP81]], [[TMP79]] 2174; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2175; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2176; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2177; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2178; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2179; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2180; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2181; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2182; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2183; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2184; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2185; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2186; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2187; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2188; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2189; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2190; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2191; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2192; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2193; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2194; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2195; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2196; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2197; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2198; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2199; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2200; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2201; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2202; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2203; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 
2204; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2205; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2206; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2207; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2208; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2209; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2210; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2211; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2212; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2213; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2214; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2215; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2216; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2217; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2218; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2219; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2220; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2221; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 2222; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2223; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2224; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2225; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2226; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2227; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2228; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2229; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2230; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2231; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2232; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2233; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 
2234; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2235; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2236; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2237; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2238; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2239; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2240; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2241; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2242; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2243; CHECK-NEXT: store <4 x i32> [[TMP152]], ptr addrspace(1) [[OUT:%.*]], align 16 2244; CHECK-NEXT: ret void 2245; 2246; GFX6-LABEL: srem_v4i32: 2247; GFX6: ; %bb.0: 2248; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 2249; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2250; GFX6-NEXT: s_abs_i32 s0, s12 2251; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 2252; GFX6-NEXT: s_sub_i32 s1, 0, s0 2253; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2254; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2255; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2256; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2257; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 2258; GFX6-NEXT: s_abs_i32 s1, s8 2259; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 2260; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2261; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 2262; GFX6-NEXT: v_readfirstlane_b32 s3, v0 2263; GFX6-NEXT: s_mul_i32 s3, s3, s0 2264; GFX6-NEXT: s_sub_i32 s1, s1, s3 2265; GFX6-NEXT: s_sub_i32 s3, s1, s0 2266; GFX6-NEXT: s_cmp_ge_u32 s1, s0 2267; GFX6-NEXT: s_cselect_b32 s1, s3, s1 2268; GFX6-NEXT: s_sub_i32 s3, s1, s0 2269; GFX6-NEXT: s_cmp_ge_u32 s1, s0 2270; GFX6-NEXT: s_cselect_b32 s0, s3, s1 2271; GFX6-NEXT: s_abs_i32 s1, s13 2272; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 2273; GFX6-NEXT: s_sub_i32 s3, 0, s1 2274; GFX6-NEXT: s_xor_b32 s0, s0, s2 2275; GFX6-NEXT: s_sub_i32 s7, s0, s2 2276; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 2277; GFX6-NEXT: s_ashr_i32 s6, s9, 31 2278; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2279; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2280; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 2281; GFX6-NEXT: s_abs_i32 s3, s9 2282; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 2283; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2284; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 2285; GFX6-NEXT: v_readfirstlane_b32 s0, v0 2286; GFX6-NEXT: s_mul_i32 s0, s0, s1 2287; GFX6-NEXT: s_sub_i32 s0, s3, s0 2288; GFX6-NEXT: s_sub_i32 s2, s0, s1 2289; GFX6-NEXT: s_cmp_ge_u32 s0, s1 2290; GFX6-NEXT: s_cselect_b32 s0, s2, s0 2291; GFX6-NEXT: s_sub_i32 s2, s0, s1 2292; GFX6-NEXT: s_cmp_ge_u32 s0, s1 2293; GFX6-NEXT: s_cselect_b32 s0, s2, s0 2294; GFX6-NEXT: s_abs_i32 s1, s14 2295; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s1 2296; GFX6-NEXT: s_sub_i32 s2, 0, s1 2297; GFX6-NEXT: s_xor_b32 s0, s0, s6 2298; GFX6-NEXT: s_sub_i32 s6, s0, s6 2299; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2300; GFX6-NEXT: s_ashr_i32 s8, s10, 31 2301; GFX6-NEXT: s_mov_b32 s3, 0xf000 2302; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2303; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2304; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 2305; GFX6-NEXT: s_abs_i32 s2, s10 2306; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 2307; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 2308; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 2309; GFX6-NEXT: v_readfirstlane_b32 s0, v0 2310; GFX6-NEXT: s_mul_i32 s0, s0, s1 2311; GFX6-NEXT: s_sub_i32 s0, s2, s0 2312; GFX6-NEXT: s_sub_i32 s2, s0, s1 2313; GFX6-NEXT: s_cmp_ge_u32 s0, s1 2314; GFX6-NEXT: s_cselect_b32 s0, s2, s0 2315; GFX6-NEXT: s_sub_i32 s2, s0, s1 2316; GFX6-NEXT: s_cmp_ge_u32 s0, s1 2317; GFX6-NEXT: s_cselect_b32 s9, s2, s0 2318; GFX6-NEXT: s_abs_i32 s10, s15 2319; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 2320; GFX6-NEXT: s_sub_i32 s0, 0, s10 2321; GFX6-NEXT: s_mov_b32 s2, -1 2322; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2323; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2324; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 2325; GFX6-NEXT: v_mov_b32_e32 
v0, s7 2326; GFX6-NEXT: v_mul_lo_u32 v1, s0, v2 2327; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2328; GFX6-NEXT: s_abs_i32 s4, s11 2329; GFX6-NEXT: s_ashr_i32 s5, s11, 31 2330; GFX6-NEXT: v_mul_hi_u32 v3, v2, v1 2331; GFX6-NEXT: v_mov_b32_e32 v1, s6 2332; GFX6-NEXT: s_xor_b32 s6, s9, s8 2333; GFX6-NEXT: s_sub_i32 s6, s6, s8 2334; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 2335; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 2336; GFX6-NEXT: v_readfirstlane_b32 s7, v2 2337; GFX6-NEXT: s_mul_i32 s7, s7, s10 2338; GFX6-NEXT: s_sub_i32 s4, s4, s7 2339; GFX6-NEXT: s_sub_i32 s7, s4, s10 2340; GFX6-NEXT: s_cmp_ge_u32 s4, s10 2341; GFX6-NEXT: s_cselect_b32 s4, s7, s4 2342; GFX6-NEXT: s_sub_i32 s7, s4, s10 2343; GFX6-NEXT: s_cmp_ge_u32 s4, s10 2344; GFX6-NEXT: s_cselect_b32 s4, s7, s4 2345; GFX6-NEXT: s_xor_b32 s4, s4, s5 2346; GFX6-NEXT: s_sub_i32 s4, s4, s5 2347; GFX6-NEXT: v_mov_b32_e32 v2, s6 2348; GFX6-NEXT: v_mov_b32_e32 v3, s4 2349; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2350; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2351; GFX6-NEXT: s_endpgm 2352; 2353; GFX9-LABEL: srem_v4i32: 2354; GFX9: ; %bb.0: 2355; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 2356; GFX9-NEXT: v_mov_b32_e32 v4, 0 2357; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2358; GFX9-NEXT: s_abs_i32 s0, s12 2359; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 2360; GFX9-NEXT: s_sub_i32 s3, 0, s0 2361; GFX9-NEXT: s_abs_i32 s2, s8 2362; GFX9-NEXT: s_ashr_i32 s1, s8, 31 2363; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2364; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2365; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2366; GFX9-NEXT: v_readfirstlane_b32 s6, v0 2367; GFX9-NEXT: s_mul_i32 s3, s3, s6 2368; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 2369; GFX9-NEXT: s_add_i32 s6, s6, s3 2370; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 2371; GFX9-NEXT: s_mul_i32 s3, s3, s0 2372; GFX9-NEXT: s_sub_i32 s2, s2, s3 2373; GFX9-NEXT: s_sub_i32 s3, s2, s0 2374; GFX9-NEXT: s_cmp_ge_u32 s2, s0 2375; GFX9-NEXT: s_cselect_b32 s2, s3, s2 2376; GFX9-NEXT: s_sub_i32 s3, s2, s0 
2377; GFX9-NEXT: s_cmp_ge_u32 s2, s0 2378; GFX9-NEXT: s_cselect_b32 s0, s3, s2 2379; GFX9-NEXT: s_abs_i32 s2, s13 2380; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2381; GFX9-NEXT: s_xor_b32 s0, s0, s1 2382; GFX9-NEXT: s_sub_i32 s7, 0, s2 2383; GFX9-NEXT: s_sub_i32 s8, s0, s1 2384; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2385; GFX9-NEXT: s_abs_i32 s6, s9 2386; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2387; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2388; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2389; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2390; GFX9-NEXT: s_mul_i32 s7, s7, s0 2391; GFX9-NEXT: s_mul_hi_u32 s1, s0, s7 2392; GFX9-NEXT: s_add_i32 s0, s0, s1 2393; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 2394; GFX9-NEXT: s_mul_i32 s0, s0, s2 2395; GFX9-NEXT: s_sub_i32 s0, s6, s0 2396; GFX9-NEXT: s_sub_i32 s1, s0, s2 2397; GFX9-NEXT: s_cmp_ge_u32 s0, s2 2398; GFX9-NEXT: s_cselect_b32 s0, s1, s0 2399; GFX9-NEXT: s_sub_i32 s1, s0, s2 2400; GFX9-NEXT: s_cmp_ge_u32 s0, s2 2401; GFX9-NEXT: s_cselect_b32 s0, s1, s0 2402; GFX9-NEXT: s_abs_i32 s1, s14 2403; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 2404; GFX9-NEXT: s_xor_b32 s0, s0, s3 2405; GFX9-NEXT: s_sub_i32 s7, 0, s1 2406; GFX9-NEXT: s_sub_i32 s3, s0, s3 2407; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2408; GFX9-NEXT: s_abs_i32 s6, s10 2409; GFX9-NEXT: s_ashr_i32 s2, s10, 31 2410; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2411; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2412; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2413; GFX9-NEXT: s_mul_i32 s7, s7, s0 2414; GFX9-NEXT: s_mul_hi_u32 s7, s0, s7 2415; GFX9-NEXT: s_add_i32 s0, s0, s7 2416; GFX9-NEXT: s_mul_hi_u32 s0, s6, s0 2417; GFX9-NEXT: s_mul_i32 s0, s0, s1 2418; GFX9-NEXT: s_sub_i32 s0, s6, s0 2419; GFX9-NEXT: s_sub_i32 s6, s0, s1 2420; GFX9-NEXT: s_cmp_ge_u32 s0, s1 2421; GFX9-NEXT: s_cselect_b32 s0, s6, s0 2422; GFX9-NEXT: s_sub_i32 s6, s0, s1 2423; GFX9-NEXT: s_cmp_ge_u32 s0, s1 2424; GFX9-NEXT: s_cselect_b32 s6, s6, s0 2425; GFX9-NEXT: s_abs_i32 s7, s15 2426; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 2427; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 2428; GFX9-NEXT: s_xor_b32 s5, s6, s2 2429; GFX9-NEXT: s_sub_i32 s6, 0, s7 2430; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 2431; GFX9-NEXT: s_sub_i32 s2, s5, s2 2432; GFX9-NEXT: s_abs_i32 s4, s11 2433; GFX9-NEXT: v_mov_b32_e32 v1, s3 2434; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2435; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2436; GFX9-NEXT: s_ashr_i32 s3, s11, 31 2437; GFX9-NEXT: v_mov_b32_e32 v0, s8 2438; GFX9-NEXT: v_readfirstlane_b32 s5, v2 2439; GFX9-NEXT: s_mul_i32 s6, s6, s5 2440; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 2441; GFX9-NEXT: s_add_i32 s5, s5, s6 2442; GFX9-NEXT: s_mul_hi_u32 s5, s4, s5 2443; GFX9-NEXT: s_mul_i32 s5, s5, s7 2444; GFX9-NEXT: s_sub_i32 s4, s4, s5 2445; GFX9-NEXT: s_sub_i32 s5, s4, s7 2446; GFX9-NEXT: s_cmp_ge_u32 s4, s7 2447; GFX9-NEXT: s_cselect_b32 s4, s5, s4 2448; GFX9-NEXT: s_sub_i32 s5, s4, s7 2449; GFX9-NEXT: s_cmp_ge_u32 s4, s7 2450; GFX9-NEXT: s_cselect_b32 s4, s5, s4 2451; GFX9-NEXT: s_xor_b32 s4, s4, s3 2452; GFX9-NEXT: s_sub_i32 s3, s4, s3 2453; GFX9-NEXT: v_mov_b32_e32 v2, s2 2454; GFX9-NEXT: v_mov_b32_e32 v3, s3 2455; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2456; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2457; GFX9-NEXT: s_endpgm 2458 %r = srem <4 x i32> %x, %y 2459 store <4 x i32> %r, ptr addrspace(1) %out 2460 ret void 2461} 2462 2463define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) { 2464; CHECK-LABEL: @udiv_v4i16( 2465; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2466; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2467; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2468; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2469; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2470; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2471; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2472; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 
2473; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2474; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2475; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2476; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2477; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2478; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2479; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2480; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2481; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2482; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2483; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2484; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> poison, i16 [[TMP19]], i64 0 2485; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 2486; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2487; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2488; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2489; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2490; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2491; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2492; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2493; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2494; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2495; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2496; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2497; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2498; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2499; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast 
oge float [[TMP33]], [[TMP34]] 2500; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2501; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2502; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2503; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2504; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2505; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 2506; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2507; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2508; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2509; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2510; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2511; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2512; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2513; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2514; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2515; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2516; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2517; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2518; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2519; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2520; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2521; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2522; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2523; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2524; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2525; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 2526; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2527; CHECK-NEXT: 
[[TMP63:%.*]] = zext i16 [[TMP61]] to i32 2528; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 2529; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 2530; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 2531; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 2532; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 2533; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 2534; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 2535; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 2536; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 2537; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 2538; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2539; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 2540; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 2541; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 2542; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 2543; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 2544; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 2545; CHECK-NEXT: store <4 x i16> [[TMP80]], ptr addrspace(1) [[OUT:%.*]], align 8 2546; CHECK-NEXT: ret void 2547; 2548; GFX6-LABEL: udiv_v4i16: 2549; GFX6: ; %bb.0: 2550; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 2551; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2552; GFX6-NEXT: s_mov_b32 s3, 0xf000 2553; GFX6-NEXT: s_mov_b32 s2, -1 2554; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2555; GFX6-NEXT: s_and_b32 s5, s10, 0xffff 2556; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 2557; GFX6-NEXT: s_lshr_b32 s5, s10, 16 2558; GFX6-NEXT: s_and_b32 s4, s8, 0xffff 2559; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 2560; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 2561; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 2562; GFX6-NEXT: 
s_lshr_b32 s4, s8, 16 2563; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2564; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 2565; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 2566; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2567; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 2568; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2569; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2570; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2571; GFX6-NEXT: s_and_b32 s4, s11, 0xffff 2572; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 2573; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 2574; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2575; GFX6-NEXT: s_and_b32 s4, s9, 0xffff 2576; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2577; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2578; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2579; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 2580; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2581; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 2582; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 2583; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2584; GFX6-NEXT: s_lshr_b32 s4, s11, 16 2585; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 2586; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2587; GFX6-NEXT: s_lshr_b32 s4, s9, 16 2588; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 2589; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2590; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2591; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2592; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2593; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2594; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 2595; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2596; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 2597; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 2598; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2599; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2600; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2601; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2602; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2603; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2604; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2605; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2606; GFX6-NEXT: s_endpgm 2607; 2608; GFX9-LABEL: udiv_v4i16: 2609; GFX9: ; 
%bb.0: 2610; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 2611; GFX9-NEXT: v_mov_b32_e32 v6, 0 2612; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2613; GFX9-NEXT: s_and_b32 s7, s2, 0xffff 2614; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 2615; GFX9-NEXT: s_and_b32 s6, s0, 0xffff 2616; GFX9-NEXT: s_lshr_b32 s2, s2, 16 2617; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 2618; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 2619; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 2620; GFX9-NEXT: s_lshr_b32 s0, s0, 16 2621; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 2622; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 2623; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 2624; GFX9-NEXT: v_trunc_f32_e32 v4, v4 2625; GFX9-NEXT: s_and_b32 s0, s3, 0xffff 2626; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 2627; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 2628; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 2629; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 2630; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2631; GFX9-NEXT: s_and_b32 s0, s1, 0xffff 2632; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 2633; GFX9-NEXT: v_trunc_f32_e32 v2, v5 2634; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2635; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 2636; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 2637; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 2638; GFX9-NEXT: s_lshr_b32 s0, s3, 16 2639; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 2640; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2641; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2642; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2643; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2644; GFX9-NEXT: s_lshr_b32 s0, s1, 16 2645; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 2646; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 2647; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 2648; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2649; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2650; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 2651; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2652; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 2653; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 2654; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2655; GFX9-NEXT: v_mad_f32 v3, -v3, v5, 
v7 2656; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2657; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 2658; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2659; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2660; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 2661; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 2662; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2663; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 2664; GFX9-NEXT: s_endpgm 2665 %r = udiv <4 x i16> %x, %y 2666 store <4 x i16> %r, ptr addrspace(1) %out 2667 ret void 2668} 2669 2670define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) { 2671; CHECK-LABEL: @urem_v4i16( 2672; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2673; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2674; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2675; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2676; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2677; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2678; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2679; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2680; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2681; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2682; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2683; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2684; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2685; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2686; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2687; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2688; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2689; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2690; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], 
[[TMP18]] 2691; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2692; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2693; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> poison, i16 [[TMP21]], i64 0 2694; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 2695; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2696; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2697; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2698; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2699; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2700; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2701; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2702; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2703; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2704; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2705; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2706; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2707; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2708; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2709; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2710; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2711; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2712; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2713; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2714; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2715; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2716; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 2717; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2718; CHECK-NEXT: [[TMP47:%.*]] = zext i16 
[[TMP45]] to i32 2719; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2720; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2721; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2722; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2723; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2724; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2725; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2726; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2727; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2728; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2729; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2730; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2731; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2732; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2733; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2734; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2735; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2736; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2737; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2738; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 2739; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2740; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 2741; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 2742; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 2743; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 2744; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 2745; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 2746; CHECK-NEXT: [[TMP75:%.*]] = call fast 
float @llvm.trunc.f32(float [[TMP74]]) 2747; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 2748; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 2749; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 2750; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 2751; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 2752; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 2753; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 2754; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 2755; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 2756; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 2757; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 2758; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 2759; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 2760; CHECK-NEXT: store <4 x i16> [[TMP88]], ptr addrspace(1) [[OUT:%.*]], align 8 2761; CHECK-NEXT: ret void 2762; 2763; GFX6-LABEL: urem_v4i16: 2764; GFX6: ; %bb.0: 2765; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 2766; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2767; GFX6-NEXT: s_mov_b32 s3, 0xf000 2768; GFX6-NEXT: s_mov_b32 s2, -1 2769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2770; GFX6-NEXT: s_and_b32 s5, s10, 0xffff 2771; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 2772; GFX6-NEXT: s_lshr_b32 s5, s10, 16 2773; GFX6-NEXT: s_and_b32 s4, s8, 0xffff 2774; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 2775; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 2776; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 2777; GFX6-NEXT: s_lshr_b32 s4, s8, 16 2778; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2779; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 2780; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 2781; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2782; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 2783; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 2784; GFX6-NEXT: v_cmp_ge_f32_e64 
vcc, |v1|, v0 2785; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2786; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2787; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 2788; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2789; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 2790; GFX6-NEXT: s_and_b32 s6, s11, 0xffff 2791; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 2792; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 2793; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 2794; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 2795; GFX6-NEXT: s_and_b32 s5, s9, 0xffff 2796; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 2797; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 2798; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 2799; GFX6-NEXT: s_lshr_b32 s4, s11, 16 2800; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 2801; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2802; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 2803; GFX6-NEXT: s_lshr_b32 s5, s9, 16 2804; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 2805; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 2806; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2807; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 2808; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 2809; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2810; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2811; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 2812; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2813; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 2814; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2815; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 2816; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2817; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2818; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 2819; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 2820; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2821; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 2822; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 2823; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2824; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2825; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 2826; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2827; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2828; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2829; GFX6-NEXT: s_endpgm 2830; 
2831; GFX9-LABEL: urem_v4i16: 2832; GFX9: ; %bb.0: 2833; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 2834; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 2835; GFX9-NEXT: v_mov_b32_e32 v6, 0 2836; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2837; GFX9-NEXT: s_and_b32 s9, s2, 0xffff 2838; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 2839; GFX9-NEXT: s_and_b32 s8, s0, 0xffff 2840; GFX9-NEXT: s_lshr_b32 s2, s2, 16 2841; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 2842; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 2843; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 2844; GFX9-NEXT: s_lshr_b32 s0, s0, 16 2845; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 2846; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 2847; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 2848; GFX9-NEXT: v_trunc_f32_e32 v4, v4 2849; GFX9-NEXT: s_and_b32 s4, s3, 0xffff 2850; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 2851; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 2852; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 2853; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 2854; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2855; GFX9-NEXT: v_trunc_f32_e32 v2, v5 2856; GFX9-NEXT: s_and_b32 s5, s1, 0xffff 2857; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 2858; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 2859; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2860; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 2861; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 2862; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 2863; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 2864; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 2865; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 2866; GFX9-NEXT: v_trunc_f32_e32 v2, v2 2867; GFX9-NEXT: s_lshr_b32 s2, s3, 16 2868; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 2869; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 2870; GFX9-NEXT: s_lshr_b32 s1, s1, 16 2871; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 2872; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2873; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 2874; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2875; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 2876; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 2877; GFX9-NEXT: v_mul_f32_e32 v3, v7, 
v8 2878; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2879; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 2880; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 2881; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2882; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 2883; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 2884; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 2885; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 2886; GFX9-NEXT: v_sub_u32_e32 v4, s0, v1 2887; GFX9-NEXT: v_sub_u32_e32 v1, s5, v2 2888; GFX9-NEXT: v_sub_u32_e32 v2, s1, v3 2889; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2890; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2891; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 2892; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 2893; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] 2894; GFX9-NEXT: s_endpgm 2895 %r = urem <4 x i16> %x, %y 2896 store <4 x i16> %r, ptr addrspace(1) %out 2897 ret void 2898} 2899 ; Test @sdiv_v4i16 - the IR assertions below expect the <4 x i16> sdiv to be expanded one lane at a time: sext to i32, f32 quotient estimate (rcp, fmul, trunc), a fixup of ((x ^ y) >> 30) | 1 added when |x - q*y| >= |y|, then shl/ashr by 16 and trunc back to i16. 2900define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) { 2901; CHECK-LABEL: @sdiv_v4i16( 2902; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2903; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2904; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2905; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2906; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2907; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2908; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2909; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2910; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2911; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2912; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2913; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2914; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2915; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2916; CHECK-NEXT: [[TMP15:%.*]] = 
fptosi float [[TMP12]] to i32 2917; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2918; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2919; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2920; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2921; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2922; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2923; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2924; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2925; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> poison, i16 [[TMP23]], i64 0 2926; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2927; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2928; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2929; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2930; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2931; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2932; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2933; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2934; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2935; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2936; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2937; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2938; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2939; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2940; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2941; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2942; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2943; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2944; CHECK-NEXT: [[TMP43:%.*]] = 
select i1 [[TMP42]], i32 [[TMP31]], i32 0 2945; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2946; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2947; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2948; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2949; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2950; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2951; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2952; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2953; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2954; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2955; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2956; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2957; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2958; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2959; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2960; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2961; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2962; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2963; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2964; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2965; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2966; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2967; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2968; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2969; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2970; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2971; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2972; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2973; CHECK-NEXT: 
[[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2974; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2975; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2976; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2977; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2978; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2979; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2980; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2981; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2982; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2983; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2984; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2985; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2986; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2987; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2988; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2989; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2990; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2991; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2992; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2993; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2994; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2995; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2996; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2997; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2998; CHECK-NEXT: store <4 x i16> [[TMP96]], ptr addrspace(1) [[OUT:%.*]], align 8 2999; CHECK-NEXT: ret void 3000; 3001; GFX6-LABEL: sdiv_v4i16: 3002; GFX6: ; %bb.0: 3003; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 3004; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3005; GFX6-NEXT: s_mov_b32 s3, 0xf000 3006; GFX6-NEXT: s_mov_b32 s2, -1 3007; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3008; GFX6-NEXT: s_sext_i32_i16 s4, s10 3009; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 3010; GFX6-NEXT: s_sext_i32_i16 s5, s8 3011; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 3012; GFX6-NEXT: s_xor_b32 s4, s5, s4 3013; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3014; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3015; GFX6-NEXT: s_or_b32 s6, s4, 1 3016; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3017; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3018; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3019; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3020; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3021; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3022; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3023; GFX6-NEXT: s_ashr_i32 s5, s10, 16 3024; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 3025; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2 3026; GFX6-NEXT: s_ashr_i32 s4, s8, 16 3027; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 3028; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 3029; GFX6-NEXT: s_xor_b32 s4, s4, s5 3030; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3031; GFX6-NEXT: s_or_b32 s6, s4, 1 3032; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 3033; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3034; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 3035; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3036; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3037; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3038; GFX6-NEXT: s_sext_i32_i16 s5, s11 3039; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 3040; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3041; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 3042; GFX6-NEXT: s_sext_i32_i16 s4, s9 3043; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 3044; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 3045; GFX6-NEXT: s_xor_b32 s4, s4, s5 3046; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3047; GFX6-NEXT: s_or_b32 s6, s4, 1 3048; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3049; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3050; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 3051; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], 
|v1|, |v0| 3052; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3053; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3054; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3055; GFX6-NEXT: s_ashr_i32 s5, s11, 16 3056; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 3057; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 3058; GFX6-NEXT: s_ashr_i32 s4, s9, 16 3059; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 3060; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 3061; GFX6-NEXT: s_xor_b32 s4, s4, s5 3062; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3063; GFX6-NEXT: s_or_b32 s6, s4, 1 3064; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3065; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3066; GFX6-NEXT: v_mad_f32 v4, -v5, v0, v4 3067; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3068; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v0| 3069; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3070; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3071; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v5 3072; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 3073; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3074; GFX6-NEXT: v_or_b32_e32 v1, v1, v0 3075; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v3 3076; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 3077; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 3078; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3079; GFX6-NEXT: s_endpgm 3080; 3081; GFX9-LABEL: sdiv_v4i16: 3082; GFX9: ; %bb.0: 3083; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 3084; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 3085; GFX9-NEXT: v_mov_b32_e32 v2, 0 3086; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3087; GFX9-NEXT: s_sext_i32_i16 s4, s2 3088; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 3089; GFX9-NEXT: s_sext_i32_i16 s5, s0 3090; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 3091; GFX9-NEXT: s_xor_b32 s4, s5, s4 3092; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3093; GFX9-NEXT: s_ashr_i32 s4, s4, 30 3094; GFX9-NEXT: s_or_b32 s8, s4, 1 3095; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3096; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3097; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3098; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3099; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], 
exec 3100; GFX9-NEXT: s_cselect_b32 s4, s8, 0 3101; GFX9-NEXT: s_ashr_i32 s2, s2, 16 3102; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3103; GFX9-NEXT: s_ashr_i32 s0, s0, 16 3104; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3105; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3106; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3107; GFX9-NEXT: s_xor_b32 s0, s0, s2 3108; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3109; GFX9-NEXT: s_sext_i32_i16 s2, s3 3110; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3111; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3112; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3113; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3114; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 3115; GFX9-NEXT: s_or_b32 s0, s0, 1 3116; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3117; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3118; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 3119; GFX9-NEXT: s_cselect_b32 s0, s0, 0 3120; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3121; GFX9-NEXT: s_sext_i32_i16 s0, s1 3122; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3123; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3124; GFX9-NEXT: s_xor_b32 s0, s0, s2 3125; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3126; GFX9-NEXT: s_or_b32 s0, s0, 1 3127; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3128; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3129; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3130; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3131; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 3132; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3133; GFX9-NEXT: s_cselect_b32 s0, s0, 0 3134; GFX9-NEXT: s_ashr_i32 s2, s3, 16 3135; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3136; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3137; GFX9-NEXT: s_ashr_i32 s0, s1, 16 3138; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 3139; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3140; GFX9-NEXT: s_xor_b32 s0, s0, s2 3141; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3142; GFX9-NEXT: s_or_b32 s2, s0, 1 3143; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3144; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3145; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3146; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3147; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], 
|v5|, |v0| 3148; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3149; GFX9-NEXT: s_cselect_b32 s0, s2, 0 3150; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3151; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 3152; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3153; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 3154; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3155; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 3156; GFX9-NEXT: s_endpgm 3157 %r = sdiv <4 x i16> %x, %y 3158 store <4 x i16> %r, ptr addrspace(1) %out 3159 ret void 3160} 3161 ; Test @srem_v4i16 - the IR assertions below expect the same per-lane f32 quotient sequence as for sdiv, followed by mul with the sign-extended divisor and sub from the sign-extended dividend to form the remainder, then shl/ashr by 16 and trunc back to i16. 3162define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x i16> %y) { 3163; CHECK-LABEL: @srem_v4i16( 3164; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3165; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3166; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3167; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3168; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3169; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3170; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3171; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3172; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3173; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3174; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3175; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3176; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3177; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3178; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3179; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3180; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3181; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3182; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 
3183; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3184; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3185; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3186; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3187; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3188; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3189; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0 3190; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 3191; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3192; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3193; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3194; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3195; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3196; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3197; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3198; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3199; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3200; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3201; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3202; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3203; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3204; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3205; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3206; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3207; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3208; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3209; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3210; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3211; CHECK-NEXT: [[TMP48:%.*]] = sub i32 
[[TMP29]], [[TMP47]] 3212; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3213; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3214; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3215; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3216; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 3217; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3218; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3219; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3220; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3221; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3222; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3223; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3224; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3225; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3226; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3227; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3228; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3229; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3230; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3231; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3232; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3233; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3234; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3235; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3236; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3237; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3238; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3239; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3240; CHECK-NEXT: [[TMP77:%.*]] = 
trunc i32 [[TMP76]] to i16 3241; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3242; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 3243; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3244; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 3245; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 3246; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 3247; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 3248; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 3249; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 3250; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 3251; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 3252; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 3253; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 3254; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 3255; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 3256; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 3257; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 3258; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 3259; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 3260; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 3261; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 3262; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 3263; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 3264; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 3265; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 3266; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 3267; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 3268; CHECK-NEXT: store <4 x i16> [[TMP104]], 
ptr addrspace(1) [[OUT:%.*]], align 8 3269; CHECK-NEXT: ret void 3270; 3271; GFX6-LABEL: srem_v4i16: 3272; GFX6: ; %bb.0: 3273; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 3274; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3275; GFX6-NEXT: s_mov_b32 s3, 0xf000 3276; GFX6-NEXT: s_mov_b32 s2, -1 3277; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3278; GFX6-NEXT: s_sext_i32_i16 s4, s10 3279; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 3280; GFX6-NEXT: s_sext_i32_i16 s5, s8 3281; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 3282; GFX6-NEXT: s_xor_b32 s4, s5, s4 3283; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3284; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3285; GFX6-NEXT: s_or_b32 s6, s4, 1 3286; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3287; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3288; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3289; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3290; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3291; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3292; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3293; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 3294; GFX6-NEXT: s_ashr_i32 s4, s10, 16 3295; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 3296; GFX6-NEXT: s_ashr_i32 s5, s8, 16 3297; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 3298; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 3299; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3300; GFX6-NEXT: s_xor_b32 s4, s5, s4 3301; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3302; GFX6-NEXT: s_lshr_b32 s6, s8, 16 3303; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3304; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3305; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3306; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3307; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 3308; GFX6-NEXT: s_lshr_b32 s7, s10, 16 3309; GFX6-NEXT: s_or_b32 s8, s4, 1 3310; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| 3311; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3312; GFX6-NEXT: s_cselect_b32 s4, s8, 0 3313; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 3314; GFX6-NEXT: s_sext_i32_i16 s4, s11 3315; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 3316; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3317; GFX6-NEXT: 
s_sext_i32_i16 s5, s9 3318; GFX6-NEXT: s_xor_b32 s4, s5, s4 3319; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 3320; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 3321; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3322; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3323; GFX6-NEXT: s_or_b32 s6, s4, 1 3324; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3325; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3326; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3327; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3328; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3329; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v2| 3330; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3331; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3332; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 3333; GFX6-NEXT: s_ashr_i32 s4, s11, 16 3334; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3335; GFX6-NEXT: s_ashr_i32 s5, s9, 16 3336; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 3337; GFX6-NEXT: s_xor_b32 s4, s5, s4 3338; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3339; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3340; GFX6-NEXT: s_lshr_b32 s6, s9, 16 3341; GFX6-NEXT: s_lshr_b32 s7, s11, 16 3342; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3343; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3344; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3345; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3346; GFX6-NEXT: s_or_b32 s8, s4, 1 3347; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| 3348; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3349; GFX6-NEXT: s_cselect_b32 s4, s8, 0 3350; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 3351; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 3352; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 3353; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 3354; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 3355; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3356; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3357; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3358; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3359; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3360; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3361; GFX6-NEXT: s_endpgm 3362; 3363; GFX9-LABEL: srem_v4i16: 3364; GFX9: ; %bb.0: 3365; GFX9-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x2c 3366; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 3367; GFX9-NEXT: v_mov_b32_e32 v2, 0 3368; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3369; GFX9-NEXT: s_sext_i32_i16 s8, s2 3370; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 3371; GFX9-NEXT: s_sext_i32_i16 s9, s0 3372; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 3373; GFX9-NEXT: s_xor_b32 s4, s9, s8 3374; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3375; GFX9-NEXT: s_ashr_i32 s4, s4, 30 3376; GFX9-NEXT: s_or_b32 s10, s4, 1 3377; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3378; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3379; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3380; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3381; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 3382; GFX9-NEXT: s_cselect_b32 s4, s10, 0 3383; GFX9-NEXT: s_ashr_i32 s10, s0, 16 3384; GFX9-NEXT: s_ashr_i32 s0, s2, 16 3385; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3386; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3387; GFX9-NEXT: s_xor_b32 s2, s10, s0 3388; GFX9-NEXT: s_ashr_i32 s2, s2, 30 3389; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 3390; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 3391; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3392; GFX9-NEXT: s_or_b32 s2, s2, 1 3393; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 3394; GFX9-NEXT: s_sext_i32_i16 s8, s1 3395; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 3396; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3397; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 3398; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3399; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| 3400; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 3401; GFX9-NEXT: s_cselect_b32 s2, s2, 0 3402; GFX9-NEXT: v_add_u32_e32 v0, s2, v4 3403; GFX9-NEXT: s_sext_i32_i16 s2, s3 3404; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 3405; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 3406; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 3407; GFX9-NEXT: s_xor_b32 s0, s8, s2 3408; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 3409; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3410; GFX9-NEXT: s_or_b32 s0, s0, 1 3411; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 3412; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 3413; GFX9-NEXT: 
v_trunc_f32_e32 v5, v5 3414; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 3415; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| 3416; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3417; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 3418; GFX9-NEXT: s_cselect_b32 s0, s0, 0 3419; GFX9-NEXT: s_ashr_i32 s3, s3, 16 3420; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 3421; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 3422; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 3423; GFX9-NEXT: s_ashr_i32 s2, s1, 16 3424; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 3425; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3426; GFX9-NEXT: s_xor_b32 s0, s2, s3 3427; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3428; GFX9-NEXT: s_or_b32 s4, s0, 1 3429; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3430; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3431; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 3432; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3433; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 3434; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3435; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3436; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 3437; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 3438; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 3439; GFX9-NEXT: v_sub_u32_e32 v1, s8, v3 3440; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 3441; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 3442; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 3443; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 3444; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 3445; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] 3446; GFX9-NEXT: s_endpgm 3447 %r = srem <4 x i16> %x, %y 3448 store <4 x i16> %r, ptr addrspace(1) %out 3449 ret void 3450} 3451 ; udiv_i3: udiv on i3. The expected IR zero-extends both operands to i32, forms the quotient with the f32 reciprocal sequence (uitofp, rcp, fmul, trunc, fmad.ftz, fcmp oge selecting a +1 correction), then masks with 7 and truncates back to i3. 3452define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { 3453; CHECK-LABEL: @udiv_i3( 3454; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3455; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3456; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3457; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3458; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3459; CHECK-NEXT: 
[[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3460; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3461; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3462; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3463; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3464; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3465; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3466; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3467; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3468; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3469; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 3470; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 3471; CHECK-NEXT: store i3 [[TMP17]], ptr addrspace(1) [[OUT:%.*]], align 1 3472; CHECK-NEXT: ret void 3473; 3474; GFX6-LABEL: udiv_i3: 3475; GFX6: ; %bb.0: 3476; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 3477; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3478; GFX6-NEXT: s_mov_b32 s3, 0xf000 3479; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3480; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 3481; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 3482; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3483; GFX6-NEXT: s_and_b32 s4, s6, 7 3484; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3485; GFX6-NEXT: s_mov_b32 s2, -1 3486; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3487; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3488; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3489; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3490; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3491; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3492; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3493; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3494; GFX6-NEXT: s_endpgm 3495; 3496; GFX9-LABEL: udiv_i3: 3497; GFX9: ; %bb.0: 3498; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 3499; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3500; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 3501; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3502; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 3503; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 3504; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3505; GFX9-NEXT: s_and_b32 s2, s2, 7 3506; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 3507; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 3508; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3509; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 3510; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 3511; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3512; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 3513; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3514; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 3515; GFX9-NEXT: s_endpgm 3516 %r = udiv i3 %x, %y 3517 store i3 %r, ptr addrspace(1) %out 3518 ret void 3519} 3520 ; urem_i3: urem on i3. The expected IR zero-extends both operands to i32, computes a quotient q with the f32 reciprocal sequence (uitofp, rcp, fmul, trunc, fmad.ftz, fcmp oge selecting a +1 correction), forms the remainder as x - q * y (mul, sub), then masks with 7 and truncates back to i3. 3521define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { 3522; CHECK-LABEL: @urem_i3( 3523; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3524; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3525; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3526; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3527; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3528; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3529; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3530; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3531; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3532; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3533; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3534; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3535; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3536; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3537; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3538; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 
[[TMP2]] 3539; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 3540; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 3541; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 3542; CHECK-NEXT: store i3 [[TMP19]], ptr addrspace(1) [[OUT:%.*]], align 1 3543; CHECK-NEXT: ret void 3544; 3545; GFX6-LABEL: urem_i3: 3546; GFX6: ; %bb.0: 3547; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 3548; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3549; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3550; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 3551; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 3552; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3553; GFX6-NEXT: s_and_b32 s3, s6, 7 3554; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 3555; GFX6-NEXT: s_lshr_b32 s2, s6, 8 3556; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3557; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3558; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3559; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3560; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3561; GFX6-NEXT: s_mov_b32 s3, 0xf000 3562; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3563; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3564; GFX6-NEXT: s_mov_b32 s2, -1 3565; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 3566; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3567; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3568; GFX6-NEXT: s_endpgm 3569; 3570; GFX9-LABEL: urem_i3: 3571; GFX9: ; %bb.0: 3572; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 3573; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3574; GFX9-NEXT: s_bfe_u32 s0, s2, 0x30008 3575; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 3576; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3577; GFX9-NEXT: s_and_b32 s1, s2, 7 3578; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 3579; GFX9-NEXT: s_lshr_b32 s0, s2, 8 3580; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 3581; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3582; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 3583; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 3584; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3585; GFX9-NEXT: v_mov_b32_e32 v1, 0 3586; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3587; GFX9-NEXT: 
v_mul_lo_u32 v0, v0, s0 3588; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3589; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 3590; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3591; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3592; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3593; GFX9-NEXT: s_endpgm 3594 %r = urem i3 %x, %y 3595 store i3 %r, ptr addrspace(1) %out 3596 ret void 3597} 3598 ; sdiv_i3: sdiv on i3. The expected IR sign-extends both operands to i32, derives the correction step as (ashr (xor x, y), 30) | 1, forms the quotient with the f32 reciprocal sequence (sitofp, rcp, fmul, trunc, fmad.ftz, fcmp oge selecting that step or 0), then re-extends the low 3 bits with shl 29 / ashr 29 and truncates back to i3. 3599define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { 3600; CHECK-LABEL: @sdiv_i3( 3601; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3602; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3603; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3604; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3605; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3606; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3607; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3608; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3609; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3610; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3611; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3612; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3613; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3614; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3615; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3616; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3617; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3618; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3619; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 3620; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 3621; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 3622; CHECK-NEXT: store i3 [[TMP21]], ptr addrspace(1) [[OUT:%.*]], align 1 3623; 
CHECK-NEXT: ret void 3624; 3625; GFX6-LABEL: sdiv_i3: 3626; GFX6: ; %bb.0: 3627; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 3628; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3629; GFX6-NEXT: s_mov_b32 s3, 0xf000 3630; GFX6-NEXT: s_mov_b32 s2, -1 3631; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3632; GFX6-NEXT: s_bfe_i32 s4, s6, 0x30008 3633; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 3634; GFX6-NEXT: s_bfe_i32 s5, s6, 0x30000 3635; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 3636; GFX6-NEXT: s_xor_b32 s4, s5, s4 3637; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3638; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3639; GFX6-NEXT: s_or_b32 s6, s4, 1 3640; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3641; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3642; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3643; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3644; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 3645; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 3646; GFX6-NEXT: s_cselect_b32 s4, s6, 0 3647; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 3648; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3649; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3650; GFX6-NEXT: s_endpgm 3651; 3652; GFX9-LABEL: sdiv_i3: 3653; GFX9: ; %bb.0: 3654; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 3655; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3656; GFX9-NEXT: v_mov_b32_e32 v1, 0 3657; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3658; GFX9-NEXT: s_bfe_i32 s3, s2, 0x30008 3659; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 3660; GFX9-NEXT: s_bfe_i32 s2, s2, 0x30000 3661; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 3662; GFX9-NEXT: s_xor_b32 s2, s2, s3 3663; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3664; GFX9-NEXT: s_ashr_i32 s2, s2, 30 3665; GFX9-NEXT: s_or_b32 s4, s2, 1 3666; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3667; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3668; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3669; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3670; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| 3671; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 3672; GFX9-NEXT: s_cselect_b32 s2, s4, 0 3673; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 3674; 
GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3675; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3676; GFX9-NEXT: s_endpgm 3677 %r = sdiv i3 %x, %y 3678 store i3 %r, ptr addrspace(1) %out 3679 ret void 3680} 3681 ; srem_i3: srem on i3. The expected IR sign-extends both operands to i32, computes a quotient q with the f32 reciprocal sequence (sitofp, rcp, fmul, trunc, fmad.ftz, fcmp oge selecting (ashr (xor x, y), 30) | 1 or 0 as the correction), forms the remainder as x - q * y (mul, sub), then re-extends the low 3 bits with shl 29 / ashr 29 and truncates back to i3. 3682define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { 3683; CHECK-LABEL: @srem_i3( 3684; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3685; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3686; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3687; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3688; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3689; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3690; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3691; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3692; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3693; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3694; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3695; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3696; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3697; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3698; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3699; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3700; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3701; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3702; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 3703; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 3704; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 3705; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 3706; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 3707; CHECK-NEXT: store i3 [[TMP23]], ptr addrspace(1) [[OUT:%.*]], align 1 3708; CHECK-NEXT: ret void 3709; 3710; GFX6-LABEL: 
srem_i3: 3711; GFX6: ; %bb.0: 3712; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 3713; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3714; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3715; GFX6-NEXT: s_bfe_i32 s2, s6, 0x30008 3716; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 3717; GFX6-NEXT: s_bfe_i32 s3, s6, 0x30000 3718; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 3719; GFX6-NEXT: s_xor_b32 s2, s3, s2 3720; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3721; GFX6-NEXT: s_ashr_i32 s2, s2, 30 3722; GFX6-NEXT: s_lshr_b32 s4, s6, 8 3723; GFX6-NEXT: s_or_b32 s5, s2, 1 3724; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3725; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3726; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3727; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3728; GFX6-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 3729; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], exec 3730; GFX6-NEXT: s_cselect_b32 s2, s5, 0 3731; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 3732; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 3733; GFX6-NEXT: s_mov_b32 s3, 0xf000 3734; GFX6-NEXT: s_mov_b32 s2, -1 3735; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 3736; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3737; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3738; GFX6-NEXT: s_endpgm 3739; 3740; GFX9-LABEL: srem_i3: 3741; GFX9: ; %bb.0: 3742; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 3743; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3744; GFX9-NEXT: s_bfe_i32 s0, s2, 0x30008 3745; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3746; GFX9-NEXT: s_bfe_i32 s1, s2, 0x30000 3747; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3748; GFX9-NEXT: s_xor_b32 s0, s1, s0 3749; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 3750; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3751; GFX9-NEXT: s_lshr_b32 s3, s2, 8 3752; GFX9-NEXT: s_or_b32 s6, s0, 1 3753; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 3754; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3755; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 3756; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 3757; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3758; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3759; GFX9-NEXT: s_cselect_b32 s0, s6, 0 3760; GFX9-NEXT: 
v_add_u32_e32 v0, s0, v2 3761; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 3762; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 3763; GFX9-NEXT: v_mov_b32_e32 v1, 0 3764; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 3765; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3766; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3767; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3768; GFX9-NEXT: s_endpgm 3769 %r = srem i3 %x, %y 3770 store i3 %r, ptr addrspace(1) %out 3771 ret void 3772} 3773 ; udiv_v3i16: udiv on <3 x i16>. The expected IR handles the three lanes one at a time (extractelement / insertelement): each lane is zero-extended to i32, divided with the f32 reciprocal sequence (uitofp, rcp, fmul, trunc, fmad.ftz, fcmp oge selecting a +1 correction), masked with 65535 and truncated to i16 before the rebuilt vector is stored. 3774define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) { 3775; CHECK-LABEL: @udiv_v3i16( 3776; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3777; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3778; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3779; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3780; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3781; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3782; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3783; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3784; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3785; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3786; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3787; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3788; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3789; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3790; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3791; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3792; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3793; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3794; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3795; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> poison, 
i16 [[TMP19]], i64 0 3796; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 3797; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3798; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3799; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3800; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3801; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3802; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3803; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3804; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3805; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3806; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3807; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3808; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3809; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3810; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3811; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3812; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3813; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3814; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3815; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3816; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 3817; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3818; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3819; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3820; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3821; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3822; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3823; CHECK-NEXT: [[TMP48:%.*]] = 
fmul fast float [[TMP45]], [[TMP47]] 3824; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3825; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3826; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3827; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3828; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3829; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3830; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3831; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3832; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3833; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3834; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3835; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3836; CHECK-NEXT: store <3 x i16> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8 3837; CHECK-NEXT: ret void 3838; 3839; GFX6-LABEL: udiv_v3i16: 3840; GFX6: ; %bb.0: 3841; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 3842; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 3843; GFX6-NEXT: s_mov_b32 s3, 0xf000 3844; GFX6-NEXT: s_mov_b32 s2, -1 3845; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3846; GFX6-NEXT: s_and_b32 s5, s10, 0xffff 3847; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 3848; GFX6-NEXT: s_lshr_b32 s5, s10, 16 3849; GFX6-NEXT: s_and_b32 s4, s8, 0xffff 3850; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 3851; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 3852; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 3853; GFX6-NEXT: s_lshr_b32 s4, s8, 16 3854; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 3855; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3856; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 3857; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3858; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 3859; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3860; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3861; GFX6-NEXT: 
v_trunc_f32_e32 v1, v1 3862; GFX6-NEXT: s_and_b32 s4, s11, 0xffff 3863; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 3864; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 3865; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 3866; GFX6-NEXT: s_and_b32 s4, s9, 0xffff 3867; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 3868; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 3869; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3870; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3871; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3872; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3873; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 3874; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3875; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3876; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3877; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 3878; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3879; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3880; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3881; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3882; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 3883; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 3884; GFX6-NEXT: s_endpgm 3885; 3886; GFX9-LABEL: udiv_v3i16: 3887; GFX9: ; %bb.0: 3888; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 3889; GFX9-NEXT: v_mov_b32_e32 v6, 0 3890; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3891; GFX9-NEXT: s_and_b32 s7, s2, 0xffff 3892; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 3893; GFX9-NEXT: s_and_b32 s6, s0, 0xffff 3894; GFX9-NEXT: s_lshr_b32 s2, s2, 16 3895; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 3896; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 3897; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3898; GFX9-NEXT: s_lshr_b32 s0, s0, 16 3899; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 3900; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 3901; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 3902; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3903; GFX9-NEXT: s_and_b32 s0, s3, 0xffff 3904; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 3905; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 3906; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 3907; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 3908; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, 
|v2|, v0 3909; GFX9-NEXT: v_trunc_f32_e32 v2, v5 3910; GFX9-NEXT: s_and_b32 s0, s1, 0xffff 3911; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 3912; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 3913; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 3914; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3915; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 3916; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3917; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 3918; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 3919; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3920; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 3921; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 3922; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 3923; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3924; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 3925; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 3926; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 3927; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3928; GFX9-NEXT: global_store_short v6, v2, s[6:7] offset:4 3929; GFX9-NEXT: global_store_dword v6, v0, s[6:7] 3930; GFX9-NEXT: s_endpgm 3931 %r = udiv <3 x i16> %x, %y 3932 store <3 x i16> %r, ptr addrspace(1) %out 3933 ret void 3934} 3935 3936define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) { 3937; CHECK-LABEL: @urem_v3i16( 3938; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3939; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3940; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3941; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3942; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3943; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3944; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3945; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3946; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3947; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3948; CHECK-NEXT: [[TMP11:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3949; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3950; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3951; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3952; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3953; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3954; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3955; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3956; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3957; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3958; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3959; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> poison, i16 [[TMP21]], i64 0 3960; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 3961; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3962; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3963; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3964; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3965; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3966; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3967; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3968; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3969; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3970; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3971; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3972; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3973; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3974; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3975; CHECK-NEXT: [[TMP38:%.*]] = 
select i1 [[TMP37]], i32 1, i32 0 3976; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3977; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3978; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3979; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3980; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3981; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3982; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 3983; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3984; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3985; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3986; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3987; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3988; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3989; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3990; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3991; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3992; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3993; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3994; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3995; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3996; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3997; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3998; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3999; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 4000; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 4001; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 4002; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 4003; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> 
[[TMP44]], i16 [[TMP65]], i64 2 4004; CHECK-NEXT: store <3 x i16> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8 4005; CHECK-NEXT: ret void 4006; 4007; GFX6-LABEL: urem_v3i16: 4008; GFX6: ; %bb.0: 4009; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 4010; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4011; GFX6-NEXT: s_mov_b32 s3, 0xf000 4012; GFX6-NEXT: s_mov_b32 s2, -1 4013; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4014; GFX6-NEXT: s_and_b32 s5, s10, 0xffff 4015; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 4016; GFX6-NEXT: s_lshr_b32 s5, s10, 16 4017; GFX6-NEXT: s_and_b32 s4, s8, 0xffff 4018; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 4019; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 4020; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 4021; GFX6-NEXT: s_lshr_b32 s4, s8, 16 4022; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 4023; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 4024; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 4025; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4026; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 4027; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 4028; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4029; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 4030; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4031; GFX6-NEXT: s_and_b32 s6, s11, 0xffff 4032; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 4033; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 4034; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 4035; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 4036; GFX6-NEXT: s_and_b32 s6, s9, 0xffff 4037; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 4038; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 4039; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 4040; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4041; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 4042; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 4043; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4044; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 4045; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4046; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 4047; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 4048; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 4049; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 4050; GFX6-NEXT: 
v_mul_lo_u32 v2, v2, s11 4051; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 4052; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4053; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 4054; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4055; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4056; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4057; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4058; GFX6-NEXT: s_endpgm 4059; 4060; GFX9-LABEL: urem_v3i16: 4061; GFX9: ; %bb.0: 4062; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 4063; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 4064; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4065; GFX9-NEXT: s_and_b32 s9, s2, 0xffff 4066; GFX9-NEXT: s_lshr_b32 s2, s2, 16 4067; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 4068; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 4069; GFX9-NEXT: s_and_b32 s8, s0, 0xffff 4070; GFX9-NEXT: s_lshr_b32 s0, s0, 16 4071; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 4072; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4073; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 4074; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4075; GFX9-NEXT: s_and_b32 s3, s3, 0xffff 4076; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 4077; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4078; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 4079; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4080; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 4081; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 4082; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4083; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 4084; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 4085; GFX9-NEXT: s_and_b32 s1, s1, 0xffff 4086; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4087; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 4088; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 4089; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 4090; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 4091; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc 4092; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4093; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4094; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 4095; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 4096; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 4097; GFX9-NEXT: 
v_mul_lo_u32 v0, v0, s9 4098; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc 4099; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 4100; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 4101; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 4102; GFX9-NEXT: v_mov_b32_e32 v3, 0 4103; GFX9-NEXT: v_sub_u32_e32 v1, s0, v1 4104; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2 4105; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 4106; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 4107; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 4108; GFX9-NEXT: global_store_dword v3, v0, s[6:7] 4109; GFX9-NEXT: s_endpgm 4110 %r = urem <3 x i16> %x, %y 4111 store <3 x i16> %r, ptr addrspace(1) %out 4112 ret void 4113} 4114 ; Test sdiv_v3i16 - signed division of two <3 x i16> kernel arguments, result stored to %out. The opt expectations below cover each of the three lanes separately - sext to i32, sitofp, a quotient estimate through llvm.amdgcn.rcp.f32 with trunc and fmad.ftz, a sign-derived correction selected by an fabs compare, then shl/ashr by 16 and trunc back to i16. The expectations are autogenerated - regenerate with utils/update_llc_test_checks.py rather than editing by hand. 4115define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) { 4116; CHECK-LABEL: @sdiv_v3i16( 4117; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4118; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4119; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4120; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4121; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4122; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4123; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4124; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4125; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4126; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4127; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4128; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4129; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4130; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4131; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4132; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4133; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4134; 
CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4135; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4136; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4137; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 4138; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 4139; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 4140; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> poison, i16 [[TMP23]], i64 0 4141; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 4142; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4143; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 4144; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 4145; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4146; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4147; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4148; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4149; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4150; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4151; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4152; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4153; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4154; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4155; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4156; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4157; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4158; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4159; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4160; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4161; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 4162; CHECK-NEXT: [[TMP46:%.*]] = ashr 
i32 [[TMP45]], 16 4163; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 4164; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 4165; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 4166; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4167; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 4168; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 4169; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4170; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4171; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4172; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4173; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4174; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4175; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4176; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4177; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4178; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4179; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4180; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4181; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4182; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4183; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4184; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4185; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 4186; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 4187; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 4188; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 4189; CHECK-NEXT: store <3 x i16> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], align 8 4190; CHECK-NEXT: ret void 4191; 4192; 
GFX6-LABEL: sdiv_v3i16: 4193; GFX6: ; %bb.0: 4194; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 4195; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4196; GFX6-NEXT: s_mov_b32 s3, 0xf000 4197; GFX6-NEXT: s_mov_b32 s2, -1 4198; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4199; GFX6-NEXT: s_sext_i32_i16 s4, s10 4200; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 4201; GFX6-NEXT: s_sext_i32_i16 s5, s8 4202; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 4203; GFX6-NEXT: s_xor_b32 s4, s5, s4 4204; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4205; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4206; GFX6-NEXT: s_or_b32 s6, s4, 1 4207; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4208; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4209; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4210; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 4211; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 4212; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4213; GFX6-NEXT: s_cselect_b32 s4, s6, 0 4214; GFX6-NEXT: s_ashr_i32 s5, s10, 16 4215; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 4216; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 4217; GFX6-NEXT: s_ashr_i32 s4, s8, 16 4218; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4219; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 4220; GFX6-NEXT: s_xor_b32 s4, s4, s5 4221; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4222; GFX6-NEXT: s_or_b32 s6, s4, 1 4223; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4224; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4225; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 4226; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| 4227; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 4228; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4229; GFX6-NEXT: s_sext_i32_i16 s5, s11 4230; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 4231; GFX6-NEXT: s_cselect_b32 s4, s6, 0 4232; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 4233; GFX6-NEXT: s_sext_i32_i16 s4, s9 4234; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 4235; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 4236; GFX6-NEXT: s_xor_b32 s4, s4, s5 4237; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4238; GFX6-NEXT: s_or_b32 s6, s4, 1 4239; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4240; GFX6-NEXT: 
v_trunc_f32_e32 v4, v4 4241; GFX6-NEXT: v_mad_f32 v3, -v4, v0, v3 4242; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4243; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| 4244; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 4245; GFX6-NEXT: s_cselect_b32 s4, s6, 0 4246; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4 4247; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 4248; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4249; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 4250; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 4251; GFX6-NEXT: buffer_store_dword v1, off, s[0:3], 0 4252; GFX6-NEXT: s_endpgm 4253; 4254; GFX9-LABEL: sdiv_v3i16: 4255; GFX9: ; %bb.0: 4256; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 4257; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 4258; GFX9-NEXT: v_mov_b32_e32 v1, 0 4259; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4260; GFX9-NEXT: s_sext_i32_i16 s4, s2 4261; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 4262; GFX9-NEXT: s_sext_i32_i16 s5, s0 4263; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 4264; GFX9-NEXT: s_xor_b32 s4, s5, s4 4265; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4266; GFX9-NEXT: s_ashr_i32 s4, s4, 30 4267; GFX9-NEXT: s_or_b32 s8, s4, 1 4268; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4269; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4270; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4271; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| 4272; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 4273; GFX9-NEXT: s_cselect_b32 s4, s8, 0 4274; GFX9-NEXT: s_ashr_i32 s2, s2, 16 4275; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4276; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4277; GFX9-NEXT: s_ashr_i32 s0, s0, 16 4278; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 4279; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 4280; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4281; GFX9-NEXT: s_xor_b32 s0, s0, s2 4282; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4283; GFX9-NEXT: s_sext_i32_i16 s2, s3 4284; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4285; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4286; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4287; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4288; GFX9-NEXT: s_or_b32 
s0, s0, 1 4289; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v0| 4290; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4291; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 4292; GFX9-NEXT: s_cselect_b32 s0, s0, 0 4293; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 4294; GFX9-NEXT: s_sext_i32_i16 s0, s1 4295; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 4296; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 4297; GFX9-NEXT: s_xor_b32 s0, s0, s2 4298; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4299; GFX9-NEXT: s_or_b32 s2, s0, 1 4300; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4301; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4302; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 4303; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4304; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 4305; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4306; GFX9-NEXT: s_cselect_b32 s0, s2, 0 4307; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 4308; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 4309; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 4310; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 4311; GFX9-NEXT: global_store_dword v1, v2, s[6:7] 4312; GFX9-NEXT: s_endpgm 4313 %r = sdiv <3 x i16> %x, %y 4314 store <3 x i16> %r, ptr addrspace(1) %out 4315 ret void 4316} 4317 ; Test srem_v3i16 - signed remainder of two <3 x i16> kernel arguments, result stored to %out. The opt expectations below cover each of the three lanes separately - the same sext, sitofp and llvm.amdgcn.rcp.f32 quotient sequence as the signed-division test, followed by a mul of the quotient with the divisor and a sub from the dividend, then shl/ashr by 16 and trunc back to i16. The expectations are autogenerated - regenerate with utils/update_llc_test_checks.py rather than editing by hand. 4318define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x i16> %y) { 4319; CHECK-LABEL: @srem_v3i16( 4320; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4321; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4322; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4323; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4324; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4325; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4326; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4327; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4328; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4329; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4330; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 
4331; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4332; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4333; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4334; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4335; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4336; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4337; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4338; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4339; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4340; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4341; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 4342; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4343; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4344; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4345; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> poison, i16 [[TMP25]], i64 0 4346; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 4347; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4348; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4349; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4350; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4351; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4352; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4353; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4354; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4355; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4356; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4357; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4358; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4359; CHECK-NEXT: [[TMP40:%.*]] = call 
fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4360; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4361; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4362; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4363; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4364; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4365; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4366; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4367; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4368; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4369; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4370; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 4371; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4372; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 4373; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4374; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4375; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4376; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4377; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4378; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4379; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4380; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4381; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4382; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4383; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4384; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4385; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4386; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4387; CHECK-NEXT: 
[[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4388; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4389; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4390; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4391; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4392; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4393; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4394; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4395; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4396; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4397; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4398; CHECK-NEXT: store <3 x i16> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8 4399; CHECK-NEXT: ret void 4400; 4401; GFX6-LABEL: srem_v3i16: 4402; GFX6: ; %bb.0: 4403; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 4404; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 4405; GFX6-NEXT: s_mov_b32 s3, 0xf000 4406; GFX6-NEXT: s_mov_b32 s2, -1 4407; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4408; GFX6-NEXT: s_sext_i32_i16 s4, s10 4409; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 4410; GFX6-NEXT: s_sext_i32_i16 s5, s8 4411; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 4412; GFX6-NEXT: s_xor_b32 s4, s5, s4 4413; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4414; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4415; GFX6-NEXT: s_or_b32 s6, s4, 1 4416; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4417; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4418; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4419; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4420; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 4421; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 4422; GFX6-NEXT: s_cselect_b32 s4, s6, 0 4423; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 4424; GFX6-NEXT: s_ashr_i32 s4, s10, 16 4425; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 4426; GFX6-NEXT: s_ashr_i32 s5, s8, 16 4427; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 4428; GFX6-NEXT: 
v_mul_lo_u32 v0, v0, s10 4429; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4430; GFX6-NEXT: s_xor_b32 s4, s5, s4 4431; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4432; GFX6-NEXT: s_lshr_b32 s6, s8, 16 4433; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4434; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4435; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4436; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4437; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 4438; GFX6-NEXT: s_lshr_b32 s7, s10, 16 4439; GFX6-NEXT: s_or_b32 s8, s4, 1 4440; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| 4441; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 4442; GFX6-NEXT: s_cselect_b32 s4, s8, 0 4443; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 4444; GFX6-NEXT: s_sext_i32_i16 s4, s11 4445; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4446; GFX6-NEXT: s_sext_i32_i16 s5, s9 4447; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 4448; GFX6-NEXT: s_xor_b32 s4, s5, s4 4449; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4450; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4451; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 4452; GFX6-NEXT: s_or_b32 s7, s4, 1 4453; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4454; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4455; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 4456; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4457; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| 4458; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 4459; GFX6-NEXT: s_cselect_b32 s4, s7, 0 4460; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 4461; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 4462; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 4463; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4464; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 4465; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4466; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4467; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4468; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4469; GFX6-NEXT: s_endpgm 4470; 4471; GFX9-LABEL: srem_v3i16: 4472; GFX9: ; %bb.0: 4473; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 4474; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 4475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 
4476; GFX9-NEXT: s_sext_i32_i16 s8, s2 4477; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 4478; GFX9-NEXT: s_sext_i32_i16 s9, s0 4479; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 4480; GFX9-NEXT: s_xor_b32 s4, s9, s8 4481; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4482; GFX9-NEXT: s_ashr_i32 s4, s4, 30 4483; GFX9-NEXT: s_or_b32 s10, s4, 1 4484; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4485; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4486; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4487; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| 4488; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 4489; GFX9-NEXT: s_cselect_b32 s4, s10, 0 4490; GFX9-NEXT: s_ashr_i32 s10, s0, 16 4491; GFX9-NEXT: s_ashr_i32 s0, s2, 16 4492; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4493; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4494; GFX9-NEXT: s_xor_b32 s2, s10, s0 4495; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4496; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 4497; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 4498; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4499; GFX9-NEXT: s_or_b32 s2, s2, 1 4500; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 4501; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4502; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4503; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4504; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4505; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0| 4506; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 4507; GFX9-NEXT: s_cselect_b32 s2, s2, 0 4508; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 4509; GFX9-NEXT: s_sext_i32_i16 s2, s3 4510; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 4511; GFX9-NEXT: s_sext_i32_i16 s3, s1 4512; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 4513; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 4514; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 4515; GFX9-NEXT: s_xor_b32 s0, s3, s2 4516; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4517; GFX9-NEXT: s_or_b32 s4, s0, 1 4518; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4519; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4520; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 4521; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4522; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 4523; GFX9-NEXT: s_and_b64 
s[0:1], s[0:1], exec 4524; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4525; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 4526; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 4527; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 4528; GFX9-NEXT: v_mov_b32_e32 v3, 0 4529; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 4530; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 4531; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4532; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 4533; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 4534; GFX9-NEXT: global_store_dword v3, v0, s[6:7] 4535; GFX9-NEXT: s_endpgm 4536 %r = srem <3 x i16> %x, %y 4537 store <3 x i16> %r, ptr addrspace(1) %out 4538 ret void 4539} 4540 ; Test udiv_v3i15 - unsigned division of two <3 x i15> kernel arguments, result stored to %out. The opt expectations below cover each of the three lanes separately - zext to i32, uitofp, a quotient estimate through llvm.amdgcn.rcp.f32 with trunc and fmad.ftz, a 0-or-1 correction selected by an fabs compare, then an and with 32767 and trunc back to i15. The expectations are autogenerated - regenerate with utils/update_llc_test_checks.py rather than editing by hand. 4541define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) { 4542; CHECK-LABEL: @udiv_v3i15( 4543; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4544; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4545; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4546; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4547; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4548; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4549; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4550; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4551; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4552; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4553; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4554; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4555; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4556; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4557; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4558; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4559; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], 
[[TMP16]] 4560; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 4561; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 4562; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> poison, i15 [[TMP19]], i64 0 4563; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 4564; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4565; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 4566; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 4567; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4568; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 4569; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4570; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4571; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4572; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4573; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4574; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4575; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4576; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4577; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4578; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4579; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4580; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 4581; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 4582; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 4583; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 4584; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4585; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 4586; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 4587; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 
[[TMP43]] to float 4588; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4589; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4590; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4591; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4592; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4593; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4594; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4595; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 4596; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4597; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4598; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4599; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4600; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 4601; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 4602; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 4603; CHECK-NEXT: store <3 x i15> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8 4604; CHECK-NEXT: ret void 4605; 4606; GFX6-LABEL: udiv_v3i15: 4607; GFX6: ; %bb.0: 4608; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 4609; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 4610; GFX6-NEXT: s_mov_b32 s3, 0xf000 4611; GFX6-NEXT: s_mov_b32 s2, -1 4612; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4613; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff 4614; GFX6-NEXT: s_and_b32 s7, s4, 0x7fff 4615; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 4616; GFX6-NEXT: v_mov_b32_e32 v2, s4 4617; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf000f 4618; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 4619; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4620; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 4621; GFX6-NEXT: s_bfe_u32 s7, s10, 0xf000f 4622; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 4623; GFX6-NEXT: v_mul_f32_e32 v4, 
v3, v4 4624; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s7 4625; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 4626; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 4627; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4628; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4629; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4630; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 4631; GFX6-NEXT: v_mov_b32_e32 v0, s10 4632; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 4633; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4634; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 4635; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4636; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4637; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4638; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 4639; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4640; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 4641; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 4642; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 4643; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4644; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 4645; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4646; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 4647; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 4648; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 4649; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 4650; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 4651; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 4652; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4653; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4654; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 4655; GFX6-NEXT: s_mov_b32 s0, s8 4656; GFX6-NEXT: s_mov_b32 s1, s9 4657; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4658; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4659; GFX6-NEXT: s_waitcnt expcnt(0) 4660; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4661; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 4662; GFX6-NEXT: s_endpgm 4663; 4664; GFX9-LABEL: udiv_v3i15: 4665; GFX9: ; %bb.0: 4666; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4667; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4668; GFX9-NEXT: v_mov_b32_e32 v2, 0 4669; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4670; GFX9-NEXT: v_mov_b32_e32 v0, s2 
4671; GFX9-NEXT: s_and_b32 s5, s6, 0x7fff 4672; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 4673; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff 4674; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 4675; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f 4676; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 4677; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4678; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 4679; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f 4680; GFX9-NEXT: v_mov_b32_e32 v3, s6 4681; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 4682; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4683; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 4684; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4685; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4686; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4687; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4688; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4689; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 4690; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4691; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 4692; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4693; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4694; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 4695; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 4696; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4697; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 4698; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 4699; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 4700; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 4701; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 4702; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4703; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 4704; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 4705; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 4706; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 4707; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4708; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 4709; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4710; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4711; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4712; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4713; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 4714; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4715; GFX9-NEXT: global_store_short v2, v0, 
s[0:1] offset:4 4716; GFX9-NEXT: s_endpgm 4717 %r = udiv <3 x i15> %x, %y 4718 store <3 x i15> %r, ptr addrspace(1) %out 4719 ret void 4720} 4721 4722define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) { 4723; CHECK-LABEL: @urem_v3i15( 4724; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4725; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4726; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4727; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4728; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4729; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4730; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4731; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4732; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4733; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4734; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4735; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4736; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4737; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4738; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4739; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4740; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4741; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 4742; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 4743; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 4744; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 4745; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> poison, i15 [[TMP21]], i64 0 4746; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 4747; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4748; 
CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 4749; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 4750; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 4751; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 4752; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 4753; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 4754; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 4755; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 4756; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 4757; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 4758; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4759; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 4760; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 4761; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 4762; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 4763; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 4764; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 4765; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 4766; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 4767; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 4768; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 4769; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4770; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 4771; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 4772; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 4773; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 4774; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 4775; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 4776; 
CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 4777; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 4778; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 4779; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 4780; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 4781; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 4782; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 4783; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 4784; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 4785; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 4786; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 4787; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 4788; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 4789; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 4790; CHECK-NEXT: store <3 x i15> [[TMP66]], ptr addrspace(1) [[OUT:%.*]], align 8 4791; CHECK-NEXT: ret void 4792; 4793; GFX6-LABEL: urem_v3i15: 4794; GFX6: ; %bb.0: 4795; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 4796; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 4797; GFX6-NEXT: s_mov_b32 s3, 0xf000 4798; GFX6-NEXT: s_mov_b32 s2, -1 4799; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4800; GFX6-NEXT: s_mov_b32 s0, s8 4801; GFX6-NEXT: s_and_b32 s8, s4, 0x7fff 4802; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 4803; GFX6-NEXT: s_and_b32 s7, s10, 0x7fff 4804; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 4805; GFX6-NEXT: v_mov_b32_e32 v2, s4 4806; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4807; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 4808; GFX6-NEXT: s_bfe_u32 s5, s4, 0xf000f 4809; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 4810; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4811; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4812; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4813; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 
4814; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4815; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f 4816; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 4817; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 4818; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 4819; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4820; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 4821; GFX6-NEXT: v_mov_b32_e32 v0, s10 4822; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 4823; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 4824; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 4825; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 4826; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4827; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 4828; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 4829; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4830; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 4831; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4832; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4833; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4834; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4835; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 4836; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4837; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 4838; GFX6-NEXT: s_lshr_b32 s4, s4, 15 4839; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 4840; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 4841; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 4842; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4843; GFX6-NEXT: s_lshr_b32 s6, s10, 15 4844; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 4845; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 4846; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4847; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4848; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 4849; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4850; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 4851; GFX6-NEXT: s_mov_b32 s1, s9 4852; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4853; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4854; GFX6-NEXT: s_waitcnt expcnt(0) 4855; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4856; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 4857; GFX6-NEXT: s_endpgm 4858; 4859; GFX9-LABEL: urem_v3i15: 4860; GFX9: ; 
%bb.0: 4861; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 4862; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 4863; GFX9-NEXT: v_mov_b32_e32 v2, 0 4864; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4865; GFX9-NEXT: v_mov_b32_e32 v0, s2 4866; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 4867; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff 4868; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 4869; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff 4870; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 4871; GFX9-NEXT: s_bfe_u32 s4, s6, 0xf000f 4872; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4873; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4 4874; GFX9-NEXT: v_mov_b32_e32 v3, s6 4875; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 4876; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4877; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4878; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4879; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4880; GFX9-NEXT: s_bfe_u32 s5, s2, 0xf000f 4881; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4882; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4883; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 4884; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4885; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 4886; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 4887; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4888; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 4889; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 4890; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 4891; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4892; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 4893; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 4894; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 4895; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 4896; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4897; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 4898; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 4899; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 4900; GFX9-NEXT: s_lshr_b32 s3, s6, 15 4901; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 4902; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 4903; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 4904; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4905; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 4906; GFX9-NEXT: 
; The next physical line finishes the gfx900 assertions and the IR body of
; @urem_v3i15, then opens @sdiv_v3i15.
; @sdiv_v3i15: signed division of two <3 x i15> kernel arguments. The
; opt-level assertions expect each lane to be sign-extended to i32, a
; correction term built as or(ashr(xor(x, y), 30), 1), and a float
; rcp / fmul / trunc / fmad quotient estimate converted with fptosi. The
; correction is added when fabs of the fmad result is >= fabs of the divisor,
; and the sum is passed through shl 17 / ashr 17 before the trunc to i15.
s_lshr_b32 s3, s2, 15 4907; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 4908; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 4909; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 4910; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 4911; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4912; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 4913; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4914; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4915; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4916; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 4917; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4918; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 4919; GFX9-NEXT: s_endpgm 4920 %r = urem <3 x i15> %x, %y 4921 store <3 x i15> %r, ptr addrspace(1) %out 4922 ret void 4923} 4924 4925define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) { 4926; CHECK-LABEL: @sdiv_v3i15( 4927; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4928; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4929; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 4930; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 4931; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4932; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4933; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4934; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4935; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4936; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4937; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4938; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4939; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4940; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4941; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4942; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4943; CHECK-NEXT: [[TMP17:%.*]] = call 
fast float @llvm.fabs.f32(float [[TMP9]]) 4944; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4945; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4946; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4947; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 4948; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 4949; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 4950; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> poison, i15 [[TMP23]], i64 0 4951; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 4952; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4953; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 4954; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 4955; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4956; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4957; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4958; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4959; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4960; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4961; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4962; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4963; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4964; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4965; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4966; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4967; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4968; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4969; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4970; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4971; CHECK-NEXT: [[TMP45:%.*]] = shl i32 
[[TMP44]], 17 4972; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 4973; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 4974; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 4975; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 4976; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4977; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 4978; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 4979; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4980; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4981; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4982; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4983; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4984; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4985; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4986; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4987; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4988; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4989; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4990; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4991; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4992; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4993; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4994; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4995; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 4996; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 4997; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 4998; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 4999; CHECK-NEXT: store <3 x i15> [[TMP72]], ptr addrspace(1) [[OUT:%.*]], 
align 8 5000; CHECK-NEXT: ret void 5001; 5002; GFX6-LABEL: sdiv_v3i15: 5003; GFX6: ; %bb.0: 5004; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 5005; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 5006; GFX6-NEXT: s_mov_b32 s3, 0xf000 5007; GFX6-NEXT: s_mov_b32 s2, -1 5008; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5009; GFX6-NEXT: v_mov_b32_e32 v0, s10 5010; GFX6-NEXT: s_bfe_i32 s6, s4, 0xf0000 5011; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 5012; GFX6-NEXT: v_mov_b32_e32 v1, s4 5013; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 30 5014; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf0000 5015; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 5016; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5017; GFX6-NEXT: s_xor_b32 s5, s5, s6 5018; GFX6-NEXT: s_ashr_i32 s5, s5, 30 5019; GFX6-NEXT: s_or_b32 s5, s5, 1 5020; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5021; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5022; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5023; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v2| 5024; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec 5025; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5026; GFX6-NEXT: s_cselect_b32 s5, s5, 0 5027; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f 5028; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 5029; GFX6-NEXT: v_add_i32_e32 v3, vcc, s5, v4 5030; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f 5031; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 5032; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 5033; GFX6-NEXT: s_xor_b32 s4, s5, s4 5034; GFX6-NEXT: s_ashr_i32 s4, s4, 30 5035; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 5036; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5037; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5038; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 5039; GFX6-NEXT: s_or_b32 s6, s4, 1 5040; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5041; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| 5042; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 5043; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 5044; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 5045; GFX6-NEXT: s_cselect_b32 s4, s6, 0 5046; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5047; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 5048; GFX6-NEXT: 
v_cvt_f32_i32_e32 v5, v0 5049; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 5050; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 5051; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5052; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5053; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 5054; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5055; GFX6-NEXT: v_mad_f32 v5, -v1, v2, v5 5056; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 5057; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| 5058; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5059; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 5060; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5061; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5062; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5063; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5064; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 5065; GFX6-NEXT: s_mov_b32 s0, s8 5066; GFX6-NEXT: s_mov_b32 s1, s9 5067; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5068; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5069; GFX6-NEXT: s_waitcnt expcnt(0) 5070; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5071; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 5072; GFX6-NEXT: s_endpgm 5073; 5074; GFX9-LABEL: sdiv_v3i15: 5075; GFX9: ; %bb.0: 5076; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5077; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5078; GFX9-NEXT: v_mov_b32_e32 v2, 0 5079; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5080; GFX9-NEXT: v_mov_b32_e32 v0, s2 5081; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf0000 5082; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 5083; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5084; GFX9-NEXT: s_bfe_i32 s3, s2, 0xf0000 5085; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 5086; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 5087; GFX9-NEXT: s_xor_b32 s3, s3, s4 5088; GFX9-NEXT: s_ashr_i32 s3, s3, 30 5089; GFX9-NEXT: s_or_b32 s3, s3, 1 5090; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5091; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5092; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5093; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| 5094; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 5095; GFX9-NEXT: s_cselect_b32 s3, s3, 0 5096; 
GFX9-NEXT: s_bfe_i32 s4, s6, 0xf000f 5097; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5098; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 5099; GFX9-NEXT: s_bfe_i32 s2, s2, 0xf000f 5100; GFX9-NEXT: v_mov_b32_e32 v1, s6 5101; GFX9-NEXT: v_add_u32_e32 v4, s3, v5 5102; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 5103; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 5104; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5105; GFX9-NEXT: s_xor_b32 s2, s2, s4 5106; GFX9-NEXT: s_ashr_i32 s2, s2, 30 5107; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5108; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5109; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 5110; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5111; GFX9-NEXT: s_or_b32 s4, s2, 1 5112; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5113; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v3| 5114; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 5115; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 5116; GFX9-NEXT: s_cselect_b32 s2, s4, 0 5117; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 5118; GFX9-NEXT: v_add_u32_e32 v5, s2, v6 5119; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 5120; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 5121; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 5122; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5123; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 5124; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 5125; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5126; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 5127; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 5128; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 5129; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5130; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 5131; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5132; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 5133; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5134; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5135; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5136; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5137; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 5138; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5139; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 5140; GFX9-NEXT: s_endpgm 5141 %r = sdiv <3 x i15> %x, %y 5142 store <3 x i15> %r, 
; The next physical line finishes the store and closing brace of @sdiv_v3i15,
; then opens @srem_v3i15.
; @srem_v3i15: signed remainder of two <3 x i15> kernel arguments. The
; opt-level assertions expect the same per-lane sext / xor-ashr-or correction /
; rcp-fmul-trunc-fmad / fptosi quotient sequence as the signed division test,
; followed by mul with the divisor, sub from the dividend, and shl 17 /
; ashr 17 before the trunc back to i15.
ptr addrspace(1) %out 5143 ret void 5144} 5145 5146define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x i15> %y) { 5147; CHECK-LABEL: @srem_v3i15( 5148; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5149; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5150; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 5151; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 5152; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5153; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5154; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5155; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5156; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5157; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5158; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5159; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5160; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5161; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5162; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5163; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5164; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5165; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5166; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5167; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5168; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5169; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 5170; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 5171; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 5172; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 5173; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> poison, i15 [[TMP25]], i64 0 5174; CHECK-NEXT: 
[[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 5175; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5176; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 5177; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 5178; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5179; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5180; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5181; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5182; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5183; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5184; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5185; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5186; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5187; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5188; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5189; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5190; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5191; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5192; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5193; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5194; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5195; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5196; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 5197; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 5198; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 5199; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 5200; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 5201; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5202; CHECK-NEXT: [[TMP55:%.*]] = sext i15 
[[TMP53]] to i32 5203; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 5204; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5205; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5206; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5207; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5208; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5209; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5210; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5211; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5212; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5213; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5214; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5215; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5216; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5217; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5218; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5219; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5220; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5221; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5222; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 5223; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 5224; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 5225; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 5226; CHECK-NEXT: store <3 x i15> [[TMP78]], ptr addrspace(1) [[OUT:%.*]], align 8 5227; CHECK-NEXT: ret void 5228; 5229; GFX6-LABEL: srem_v3i15: 5230; GFX6: ; %bb.0: 5231; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 5232; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 5233; GFX6-NEXT: s_mov_b32 s3, 0xf000 5234; GFX6-NEXT: s_mov_b32 s2, -1 5235; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 5236; GFX6-NEXT: s_bfe_i32 s6, s10, 0xf0000 5237; GFX6-NEXT: v_mov_b32_e32 v2, s4 5238; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 5239; GFX6-NEXT: s_bfe_i32 s5, s4, 0xf0000 5240; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 5241; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s6 5242; GFX6-NEXT: s_xor_b32 s5, s6, s5 5243; GFX6-NEXT: s_ashr_i32 s5, s5, 30 5244; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5245; GFX6-NEXT: s_mov_b32 s0, s8 5246; GFX6-NEXT: s_mov_b32 s1, s9 5247; GFX6-NEXT: s_lshr_b32 s8, s10, 15 5248; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 5249; GFX6-NEXT: v_trunc_f32_e32 v6, v6 5250; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 5251; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 5252; GFX6-NEXT: s_lshr_b32 s9, s4, 15 5253; GFX6-NEXT: s_or_b32 s5, s5, 1 5254; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v5|, |v4| 5255; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec 5256; GFX6-NEXT: s_cselect_b32 s5, s5, 0 5257; GFX6-NEXT: v_add_i32_e32 v4, vcc, s5, v6 5258; GFX6-NEXT: v_mul_lo_u32 v4, v4, s4 5259; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f 5260; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 5261; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f 5262; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s5 5263; GFX6-NEXT: s_xor_b32 s4, s5, s4 5264; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 5265; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 5266; GFX6-NEXT: s_ashr_i32 s4, s4, 30 5267; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 5268; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 5269; GFX6-NEXT: v_trunc_f32_e32 v7, v7 5270; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 5271; GFX6-NEXT: s_or_b32 s6, s4, 1 5272; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 5273; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| 5274; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 5275; GFX6-NEXT: v_mov_b32_e32 v0, s10 5276; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 5277; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec 5278; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 5279; GFX6-NEXT: s_cselect_b32 s4, s6, 0 5280; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5281; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 5282; GFX6-NEXT: 
v_cvt_f32_i32_e32 v7, v0 5283; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 5284; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 5285; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 5286; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 5287; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5288; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 5289; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5290; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5291; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5292; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 5293; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5294; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 5295; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5296; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 5297; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 5298; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 5299; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 5300; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5301; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5302; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 5303; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5304; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5305; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5306; GFX6-NEXT: s_waitcnt expcnt(0) 5307; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5308; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 5309; GFX6-NEXT: s_endpgm 5310; 5311; GFX9-LABEL: srem_v3i15: 5312; GFX9: ; %bb.0: 5313; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5314; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 5315; GFX9-NEXT: v_mov_b32_e32 v2, 0 5316; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5317; GFX9-NEXT: v_mov_b32_e32 v0, s2 5318; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5319; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf0000 5320; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 5321; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf0000 5322; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s4 5323; GFX9-NEXT: s_xor_b32 s3, s4, s3 5324; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 5325; GFX9-NEXT: v_mov_b32_e32 v1, s6 5326; GFX9-NEXT: s_ashr_i32 s3, s3, 30 5327; GFX9-NEXT: s_lshr_b32 s8, s2, 15 5328; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5329; GFX9-NEXT: v_trunc_f32_e32 v6, 
v6 5330; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 5331; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5332; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5333; GFX9-NEXT: s_lshr_b32 s7, s6, 15 5334; GFX9-NEXT: s_or_b32 s3, s3, 1 5335; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| 5336; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 5337; GFX9-NEXT: s_cselect_b32 s3, s3, 0 5338; GFX9-NEXT: v_add_u32_e32 v4, s3, v6 5339; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf000f 5340; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s3 5341; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf000f 5342; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s4 5343; GFX9-NEXT: s_xor_b32 s3, s4, s3 5344; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 5345; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 5346; GFX9-NEXT: s_ashr_i32 s3, s3, 30 5347; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5348; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 5349; GFX9-NEXT: v_trunc_f32_e32 v7, v7 5350; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 5351; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 5352; GFX9-NEXT: s_or_b32 s3, s3, 1 5353; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| 5354; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 5355; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 5356; GFX9-NEXT: s_cselect_b32 s3, s3, 0 5357; GFX9-NEXT: v_add_u32_e32 v5, s3, v7 5358; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 5359; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 5360; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 5361; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 5362; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 5363; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 5364; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9 5365; GFX9-NEXT: v_trunc_f32_e32 v7, v7 5366; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 5367; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 5368; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 5369; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5370; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 5371; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 5372; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 5373; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 5374; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 5375; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 5376; GFX9-NEXT: 
; The next physical line finishes the gfx900 assertions and the IR body of
; @srem_v3i15, then opens @udiv_i32_oddk_denom.
; @udiv_i32_oddk_denom: unsigned i32 division of a kernel argument by the
; constant 1235195. The opt-level assertions expect the udiv instruction to be
; left as written. The GFX6 and GFX9 assertions expect a multiply-high by
; 0xb2a50881 followed by sub, shift-right by 1, add, and shift-right by 20
; (VALU forms on tahiti, SALU forms on gfx900).
v_sub_u32_e32 v4, s8, v5 5377; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 5378; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 5379; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5380; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 5381; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5382; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5383; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5384; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 5385; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5386; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 5387; GFX9-NEXT: s_endpgm 5388 %r = srem <3 x i15> %x, %y 5389 store <3 x i15> %r, ptr addrspace(1) %out 5390 ret void 5391} 5392 5393define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { 5394; CHECK-LABEL: @udiv_i32_oddk_denom( 5395; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 5396; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 5397; CHECK-NEXT: ret void 5398; 5399; GFX6-LABEL: udiv_i32_oddk_denom: 5400; GFX6: ; %bb.0: 5401; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 5402; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5403; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5404; GFX6-NEXT: s_mov_b32 s3, 0xf000 5405; GFX6-NEXT: s_mov_b32 s2, -1 5406; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5407; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 5408; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v0 5409; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5410; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 5411; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5412; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5413; GFX6-NEXT: s_endpgm 5414; 5415; GFX9-LABEL: udiv_i32_oddk_denom: 5416; GFX9: ; %bb.0: 5417; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 5418; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5419; GFX9-NEXT: v_mov_b32_e32 v0, 0 5420; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5421; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xb2a50881 5422; GFX9-NEXT: s_sub_i32 s2, s2, s3 5423; GFX9-NEXT: s_lshr_b32 s2, s2, 1 5424; GFX9-NEXT: s_add_i32 s2, s2, s3 5425; GFX9-NEXT: s_lshr_b32 s2, s2, 20 5426; GFX9-NEXT: 
v_mov_b32_e32 v1, s2 5427; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5428; GFX9-NEXT: s_endpgm 5429 %r = udiv i32 %x, 1235195 5430 store i32 %r, ptr addrspace(1) %out 5431 ret void 5432} 5433 5434define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { 5435; CHECK-LABEL: @udiv_i32_pow2k_denom( 5436; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 5437; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 5438; CHECK-NEXT: ret void 5439; 5440; GFX6-LABEL: udiv_i32_pow2k_denom: 5441; GFX6: ; %bb.0: 5442; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 5443; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5444; GFX6-NEXT: s_mov_b32 s3, 0xf000 5445; GFX6-NEXT: s_mov_b32 s2, -1 5446; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5447; GFX6-NEXT: s_lshr_b32 s4, s6, 12 5448; GFX6-NEXT: v_mov_b32_e32 v0, s4 5449; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5450; GFX6-NEXT: s_endpgm 5451; 5452; GFX9-LABEL: udiv_i32_pow2k_denom: 5453; GFX9: ; %bb.0: 5454; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 5455; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5456; GFX9-NEXT: v_mov_b32_e32 v0, 0 5457; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5458; GFX9-NEXT: s_lshr_b32 s2, s2, 12 5459; GFX9-NEXT: v_mov_b32_e32 v1, s2 5460; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5461; GFX9-NEXT: s_endpgm 5462 %r = udiv i32 %x, 4096 5463 store i32 %r, ptr addrspace(1) %out 5464 ret void 5465} 5466 5467define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) { 5468; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 5469; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5470; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 5471; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 5472; CHECK-NEXT: ret void 5473; 5474; GFX6-LABEL: udiv_i32_pow2_shl_denom: 5475; GFX6: ; %bb.0: 5476; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5477; GFX6-NEXT: s_mov_b32 s7, 0xf000 5478; GFX6-NEXT: s_mov_b32 s6, -1 5479; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5480; 
GFX6-NEXT: s_mov_b32 s4, s0 5481; GFX6-NEXT: s_add_i32 s0, s3, 12 5482; GFX6-NEXT: s_lshr_b32 s0, s2, s0 5483; GFX6-NEXT: s_mov_b32 s5, s1 5484; GFX6-NEXT: v_mov_b32_e32 v0, s0 5485; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5486; GFX6-NEXT: s_endpgm 5487; 5488; GFX9-LABEL: udiv_i32_pow2_shl_denom: 5489; GFX9: ; %bb.0: 5490; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5491; GFX9-NEXT: v_mov_b32_e32 v0, 0 5492; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5493; GFX9-NEXT: s_add_i32 s3, s3, 12 5494; GFX9-NEXT: s_lshr_b32 s2, s2, s3 5495; GFX9-NEXT: v_mov_b32_e32 v1, s2 5496; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5497; GFX9-NEXT: s_endpgm 5498 %shl.y = shl i32 4096, %y 5499 %r = udiv i32 %x, %shl.y 5500 store i32 %r, ptr addrspace(1) %out 5501 ret void 5502} 5503 5504define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) { 5505; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 5506; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5507; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5508; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 5509; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5510; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 5511; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5512; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8 5513; CHECK-NEXT: ret void 5514; 5515; GFX6-LABEL: udiv_v2i32_pow2k_denom: 5516; GFX6: ; %bb.0: 5517; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5518; GFX6-NEXT: s_mov_b32 s7, 0xf000 5519; GFX6-NEXT: s_mov_b32 s6, -1 5520; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5521; GFX6-NEXT: s_mov_b32 s4, s0 5522; GFX6-NEXT: s_mov_b32 s5, s1 5523; GFX6-NEXT: s_lshr_b32 s0, s2, 12 5524; GFX6-NEXT: s_lshr_b32 s1, s3, 12 5525; GFX6-NEXT: v_mov_b32_e32 v0, s0 5526; GFX6-NEXT: v_mov_b32_e32 v1, s1 5527; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5528; GFX6-NEXT: s_endpgm 5529; 
5530; GFX9-LABEL: udiv_v2i32_pow2k_denom: 5531; GFX9: ; %bb.0: 5532; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5533; GFX9-NEXT: v_mov_b32_e32 v2, 0 5534; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5535; GFX9-NEXT: s_lshr_b32 s2, s2, 12 5536; GFX9-NEXT: s_lshr_b32 s3, s3, 12 5537; GFX9-NEXT: v_mov_b32_e32 v0, s2 5538; GFX9-NEXT: v_mov_b32_e32 v1, s3 5539; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 5540; GFX9-NEXT: s_endpgm 5541 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 5542 store <2 x i32> %r, ptr addrspace(1) %out 5543 ret void 5544} 5545 5546define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) { 5547; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 5548; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5549; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5550; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 5551; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5552; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 5553; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5554; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8 5555; CHECK-NEXT: ret void 5556; 5557; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 5558; GFX6: ; %bb.0: 5559; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5560; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 5561; GFX6-NEXT: s_mov_b32 s7, 0xf000 5562; GFX6-NEXT: s_mov_b32 s6, -1 5563; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5564; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 5565; GFX6-NEXT: s_mov_b32 s4, s0 5566; GFX6-NEXT: s_lshr_b32 s0, s2, 12 5567; GFX6-NEXT: s_mov_b32 s5, s1 5568; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 5569; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5570; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 5571; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 5572; GFX6-NEXT: v_mov_b32_e32 v0, s0 5573; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5574; GFX6-NEXT: s_endpgm 5575; 5576; 
GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 5577; GFX9: ; %bb.0: 5578; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5579; GFX9-NEXT: v_mov_b32_e32 v2, 0 5580; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5581; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 5582; GFX9-NEXT: s_sub_i32 s3, s3, s4 5583; GFX9-NEXT: s_lshr_b32 s3, s3, 1 5584; GFX9-NEXT: s_add_i32 s3, s3, s4 5585; GFX9-NEXT: s_lshr_b32 s2, s2, 12 5586; GFX9-NEXT: s_lshr_b32 s3, s3, 11 5587; GFX9-NEXT: v_mov_b32_e32 v0, s2 5588; GFX9-NEXT: v_mov_b32_e32 v1, s3 5589; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 5590; GFX9-NEXT: s_endpgm 5591 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 5592 store <2 x i32> %r, ptr addrspace(1) %out 5593 ret void 5594} 5595 5596define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) { 5597; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 5598; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]] 5599; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5600; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5601; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5602; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5603; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5604; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5605; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5606; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5607; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5608; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5609; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5610; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5611; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5612; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5613; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5614; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5615; CHECK-NEXT: 
[[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5616; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5617; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5618; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5619; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5620; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5621; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5622; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5623; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 5624; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 5625; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5626; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 5627; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 5628; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 5629; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 5630; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> poison, i32 [[TMP31]], i64 0 5631; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 5632; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5633; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 5634; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5635; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 5636; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 5637; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 5638; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 5639; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 5640; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 5641; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 5642; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5643; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 5644; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 5645; CHECK-NEXT: 
[[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 5646; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 5647; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 5648; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 5649; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5650; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 5651; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 5652; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 5653; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 5654; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 5655; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 5656; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 5657; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 5658; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 5659; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 5660; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 5661; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 5662; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 5663; CHECK-NEXT: store <2 x i32> [[TMP64]], ptr addrspace(1) [[OUT:%.*]], align 8 5664; CHECK-NEXT: ret void 5665; 5666; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 5667; GFX6: ; %bb.0: 5668; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb 5669; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 5670; GFX6-NEXT: s_mov_b32 s7, 0xf000 5671; GFX6-NEXT: s_mov_b32 s6, -1 5672; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5673; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 5674; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 5675; GFX6-NEXT: s_sub_i32 s1, 0, s0 5676; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 5677; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 5678; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5679; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 5680; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5681; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5682; GFX6-NEXT: v_mul_lo_u32 
v1, s1, v0 5683; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 5684; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5685; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 5686; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 5687; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5688; GFX6-NEXT: v_readfirstlane_b32 s1, v0 5689; GFX6-NEXT: s_mul_i32 s1, s1, s0 5690; GFX6-NEXT: s_sub_i32 s1, s8, s1 5691; GFX6-NEXT: s_sub_i32 s3, s1, s0 5692; GFX6-NEXT: s_cmp_ge_u32 s1, s0 5693; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 5694; GFX6-NEXT: s_cselect_b32 s1, s3, s1 5695; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 5696; GFX6-NEXT: s_cmp_ge_u32 s1, s0 5697; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 5698; GFX6-NEXT: s_sub_i32 s3, 0, s2 5699; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1 5700; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5701; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 5702; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 5703; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 5704; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5705; GFX6-NEXT: v_mul_hi_u32 v1, s9, v1 5706; GFX6-NEXT: v_readfirstlane_b32 s0, v1 5707; GFX6-NEXT: s_mul_i32 s0, s0, s2 5708; GFX6-NEXT: s_sub_i32 s0, s9, s0 5709; GFX6-NEXT: s_sub_i32 s1, s0, s2 5710; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 5711; GFX6-NEXT: s_cmp_ge_u32 s0, s2 5712; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 5713; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5714; GFX6-NEXT: s_cselect_b32 s0, s1, s0 5715; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 5716; GFX6-NEXT: s_cmp_ge_u32 s0, s2 5717; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 5718; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5719; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5720; GFX6-NEXT: s_endpgm 5721; 5722; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 5723; GFX9: ; %bb.0: 5724; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 5725; GFX9-NEXT: v_mov_b32_e32 v2, 0 5726; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5727; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 5728; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 5729; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 5730; GFX9-NEXT: v_cvt_f32_u32_e32 
v1, s6 5731; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 5732; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 5733; GFX9-NEXT: s_sub_i32 s4, 0, s7 5734; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 5735; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5736; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 5737; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 5738; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5739; GFX9-NEXT: v_readfirstlane_b32 s5, v0 5740; GFX9-NEXT: s_mul_i32 s4, s4, s5 5741; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 5742; GFX9-NEXT: s_add_i32 s5, s5, s4 5743; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 5744; GFX9-NEXT: s_mul_i32 s5, s4, s7 5745; GFX9-NEXT: s_sub_i32 s0, s0, s5 5746; GFX9-NEXT: s_add_i32 s9, s4, 1 5747; GFX9-NEXT: s_sub_i32 s5, s0, s7 5748; GFX9-NEXT: s_cmp_ge_u32 s0, s7 5749; GFX9-NEXT: s_cselect_b32 s4, s9, s4 5750; GFX9-NEXT: s_cselect_b32 s0, s5, s0 5751; GFX9-NEXT: s_add_i32 s5, s4, 1 5752; GFX9-NEXT: s_cmp_ge_u32 s0, s7 5753; GFX9-NEXT: v_readfirstlane_b32 s8, v1 5754; GFX9-NEXT: s_cselect_b32 s0, s5, s4 5755; GFX9-NEXT: s_sub_i32 s4, 0, s6 5756; GFX9-NEXT: s_mul_i32 s4, s4, s8 5757; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 5758; GFX9-NEXT: s_add_i32 s8, s8, s4 5759; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 5760; GFX9-NEXT: s_mul_i32 s5, s4, s6 5761; GFX9-NEXT: s_sub_i32 s1, s1, s5 5762; GFX9-NEXT: s_add_i32 s7, s4, 1 5763; GFX9-NEXT: s_sub_i32 s5, s1, s6 5764; GFX9-NEXT: s_cmp_ge_u32 s1, s6 5765; GFX9-NEXT: s_cselect_b32 s4, s7, s4 5766; GFX9-NEXT: s_cselect_b32 s1, s5, s1 5767; GFX9-NEXT: s_add_i32 s5, s4, 1 5768; GFX9-NEXT: s_cmp_ge_u32 s1, s6 5769; GFX9-NEXT: s_cselect_b32 s1, s5, s4 5770; GFX9-NEXT: v_mov_b32_e32 v0, s0 5771; GFX9-NEXT: v_mov_b32_e32 v1, s1 5772; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5773; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5774; GFX9-NEXT: s_endpgm 5775 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 5776 %r = udiv <2 x i32> %x, %shl.y 5777 store <2 x i32> %r, ptr addrspace(1) %out 5778 ret void 5779} 5780 5781define amdgpu_kernel void @urem_i32_oddk_denom(ptr 
addrspace(1) %out, i32 %x) { 5782; CHECK-LABEL: @urem_i32_oddk_denom( 5783; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 5784; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 5785; CHECK-NEXT: ret void 5786; 5787; GFX6-LABEL: urem_i32_oddk_denom: 5788; GFX6: ; %bb.0: 5789; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 5790; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5791; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5792; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 5793; GFX6-NEXT: s_mov_b32 s3, 0xf000 5794; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5795; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 5796; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v0 5797; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5798; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 5799; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5800; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 5801; GFX6-NEXT: s_mov_b32 s2, -1 5802; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 5803; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5804; GFX6-NEXT: s_endpgm 5805; 5806; GFX9-LABEL: urem_i32_oddk_denom: 5807; GFX9: ; %bb.0: 5808; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 5809; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5810; GFX9-NEXT: v_mov_b32_e32 v0, 0 5811; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5812; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xb2a50881 5813; GFX9-NEXT: s_sub_i32 s4, s2, s3 5814; GFX9-NEXT: s_lshr_b32 s4, s4, 1 5815; GFX9-NEXT: s_add_i32 s4, s4, s3 5816; GFX9-NEXT: s_lshr_b32 s3, s4, 20 5817; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb 5818; GFX9-NEXT: s_sub_i32 s2, s2, s3 5819; GFX9-NEXT: v_mov_b32_e32 v1, s2 5820; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5821; GFX9-NEXT: s_endpgm 5822 %r = urem i32 %x, 1235195 5823 store i32 %r, ptr addrspace(1) %out 5824 ret void 5825} 5826 5827define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { 5828; CHECK-LABEL: @urem_i32_pow2k_denom( 5829; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 5830; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 5831; CHECK-NEXT: ret 
void 5832; 5833; GFX6-LABEL: urem_i32_pow2k_denom: 5834; GFX6: ; %bb.0: 5835; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 5836; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 5837; GFX6-NEXT: s_mov_b32 s3, 0xf000 5838; GFX6-NEXT: s_mov_b32 s2, -1 5839; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5840; GFX6-NEXT: s_and_b32 s4, s6, 0xfff 5841; GFX6-NEXT: v_mov_b32_e32 v0, s4 5842; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5843; GFX6-NEXT: s_endpgm 5844; 5845; GFX9-LABEL: urem_i32_pow2k_denom: 5846; GFX9: ; %bb.0: 5847; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 5848; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 5849; GFX9-NEXT: v_mov_b32_e32 v0, 0 5850; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5851; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 5852; GFX9-NEXT: v_mov_b32_e32 v1, s2 5853; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5854; GFX9-NEXT: s_endpgm 5855 %r = urem i32 %x, 4096 5856 store i32 %r, ptr addrspace(1) %out 5857 ret void 5858} 5859 5860define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) { 5861; CHECK-LABEL: @urem_i32_pow2_shl_denom( 5862; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5863; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 5864; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 5865; CHECK-NEXT: ret void 5866; 5867; GFX6-LABEL: urem_i32_pow2_shl_denom: 5868; GFX6: ; %bb.0: 5869; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5870; GFX6-NEXT: s_mov_b32 s7, 0xf000 5871; GFX6-NEXT: s_mov_b32 s6, -1 5872; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5873; GFX6-NEXT: s_mov_b32 s4, s0 5874; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3 5875; GFX6-NEXT: s_add_i32 s0, s0, -1 5876; GFX6-NEXT: s_and_b32 s0, s2, s0 5877; GFX6-NEXT: s_mov_b32 s5, s1 5878; GFX6-NEXT: v_mov_b32_e32 v0, s0 5879; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5880; GFX6-NEXT: s_endpgm 5881; 5882; GFX9-LABEL: urem_i32_pow2_shl_denom: 5883; GFX9: ; %bb.0: 5884; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5885; GFX9-NEXT: v_mov_b32_e32 v0, 0 5886; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 5887; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 5888; GFX9-NEXT: s_add_i32 s3, s3, -1 5889; GFX9-NEXT: s_and_b32 s2, s2, s3 5890; GFX9-NEXT: v_mov_b32_e32 v1, s2 5891; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 5892; GFX9-NEXT: s_endpgm 5893 %shl.y = shl i32 4096, %y 5894 %r = urem i32 %x, %shl.y 5895 store i32 %r, ptr addrspace(1) %out 5896 ret void 5897} 5898 5899define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) { 5900; CHECK-LABEL: @urem_v2i32_pow2k_denom( 5901; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5902; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 5903; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 5904; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5905; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 5906; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5907; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8 5908; CHECK-NEXT: ret void 5909; 5910; GFX6-LABEL: urem_v2i32_pow2k_denom: 5911; GFX6: ; %bb.0: 5912; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 5913; GFX6-NEXT: s_mov_b32 s7, 0xf000 5914; GFX6-NEXT: s_mov_b32 s6, -1 5915; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5916; GFX6-NEXT: s_mov_b32 s4, s0 5917; GFX6-NEXT: s_mov_b32 s5, s1 5918; GFX6-NEXT: s_and_b32 s0, s2, 0xfff 5919; GFX6-NEXT: s_and_b32 s1, s3, 0xfff 5920; GFX6-NEXT: v_mov_b32_e32 v0, s0 5921; GFX6-NEXT: v_mov_b32_e32 v1, s1 5922; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5923; GFX6-NEXT: s_endpgm 5924; 5925; GFX9-LABEL: urem_v2i32_pow2k_denom: 5926; GFX9: ; %bb.0: 5927; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 5928; GFX9-NEXT: v_mov_b32_e32 v2, 0 5929; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5930; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 5931; GFX9-NEXT: s_and_b32 s3, s3, 0xfff 5932; GFX9-NEXT: v_mov_b32_e32 v0, s2 5933; GFX9-NEXT: v_mov_b32_e32 v1, s3 5934; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] 5935; GFX9-NEXT: s_endpgm 5936 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 5937 store <2 x i32> %r, ptr addrspace(1) %out 5938 ret void 5939} 5940 5941define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) { 5942; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 5943; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]] 5944; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5945; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5946; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5947; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5948; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5949; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5950; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5951; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5952; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5953; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5954; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5955; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5956; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5957; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5958; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5959; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5960; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5961; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5962; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5963; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5964; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5965; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5966; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5967; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5968; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], 
[[TMP2]] 5969; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 5970; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 5971; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 5972; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 5973; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> poison, i32 [[TMP29]], i64 0 5974; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 5975; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5976; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 5977; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5978; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 5979; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 5980; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 5981; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 5982; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 5983; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 5984; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 5985; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 5986; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 5987; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5988; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 5989; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 5990; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 5991; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 5992; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 5993; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 5994; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5995; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 5996; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 5997; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 5998; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 
5999; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 6000; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 6001; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 6002; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 6003; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 6004; CHECK-NEXT: store <2 x i32> [[TMP60]], ptr addrspace(1) [[OUT:%.*]], align 8 6005; CHECK-NEXT: ret void 6006; 6007; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 6008; GFX6: ; %bb.0: 6009; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 6010; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 6011; GFX6-NEXT: s_mov_b32 s7, 0xf000 6012; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6013; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 6014; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 6015; GFX6-NEXT: s_sub_i32 s6, 0, s2 6016; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6017; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 6018; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6019; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 6020; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6021; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6022; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 6023; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6024; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6025; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6026; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 6027; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6028; GFX6-NEXT: v_readfirstlane_b32 s6, v0 6029; GFX6-NEXT: s_mul_i32 s6, s6, s2 6030; GFX6-NEXT: s_sub_i32 s0, s0, s6 6031; GFX6-NEXT: s_sub_i32 s6, s0, s2 6032; GFX6-NEXT: s_cmp_ge_u32 s0, s2 6033; GFX6-NEXT: s_cselect_b32 s0, s6, s0 6034; GFX6-NEXT: s_sub_i32 s6, s0, s2 6035; GFX6-NEXT: s_cmp_ge_u32 s0, s2 6036; GFX6-NEXT: s_cselect_b32 s0, s6, s0 6037; GFX6-NEXT: s_sub_i32 s2, 0, s3 6038; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 6039; GFX6-NEXT: s_mov_b32 s6, -1 6040; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 6041; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6042; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 
6043; GFX6-NEXT: v_readfirstlane_b32 s2, v0 6044; GFX6-NEXT: s_mul_i32 s2, s2, s3 6045; GFX6-NEXT: s_sub_i32 s1, s1, s2 6046; GFX6-NEXT: s_sub_i32 s2, s1, s3 6047; GFX6-NEXT: s_cmp_ge_u32 s1, s3 6048; GFX6-NEXT: s_cselect_b32 s1, s2, s1 6049; GFX6-NEXT: s_sub_i32 s2, s1, s3 6050; GFX6-NEXT: s_cmp_ge_u32 s1, s3 6051; GFX6-NEXT: s_cselect_b32 s1, s2, s1 6052; GFX6-NEXT: v_mov_b32_e32 v0, s0 6053; GFX6-NEXT: v_mov_b32_e32 v1, s1 6054; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6055; GFX6-NEXT: s_endpgm 6056; 6057; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 6058; GFX9: ; %bb.0: 6059; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6060; GFX9-NEXT: v_mov_b32_e32 v2, 0 6061; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6062; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s2 6063; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 6064; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 6065; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 6066; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 6067; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6068; GFX9-NEXT: s_sub_i32 s4, 0, s7 6069; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6070; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6071; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6072; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6073; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6074; GFX9-NEXT: v_readfirstlane_b32 s5, v0 6075; GFX9-NEXT: s_mul_i32 s4, s4, s5 6076; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 6077; GFX9-NEXT: s_add_i32 s5, s5, s4 6078; GFX9-NEXT: s_mul_hi_u32 s4, s0, s5 6079; GFX9-NEXT: s_mul_i32 s4, s4, s7 6080; GFX9-NEXT: s_sub_i32 s0, s0, s4 6081; GFX9-NEXT: s_sub_i32 s4, s0, s7 6082; GFX9-NEXT: s_cmp_ge_u32 s0, s7 6083; GFX9-NEXT: s_cselect_b32 s0, s4, s0 6084; GFX9-NEXT: s_sub_i32 s4, s0, s7 6085; GFX9-NEXT: s_cmp_ge_u32 s0, s7 6086; GFX9-NEXT: v_readfirstlane_b32 s8, v1 6087; GFX9-NEXT: s_cselect_b32 s0, s4, s0 6088; GFX9-NEXT: s_sub_i32 s4, 0, s6 6089; GFX9-NEXT: s_mul_i32 s4, s4, s8 6090; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 6091; GFX9-NEXT: s_add_i32 s8, s8, s4 6092; GFX9-NEXT: s_mul_hi_u32 s4, s1, s8 
6093; GFX9-NEXT: s_mul_i32 s4, s4, s6 6094; GFX9-NEXT: s_sub_i32 s1, s1, s4 6095; GFX9-NEXT: s_sub_i32 s4, s1, s6 6096; GFX9-NEXT: s_cmp_ge_u32 s1, s6 6097; GFX9-NEXT: s_cselect_b32 s1, s4, s1 6098; GFX9-NEXT: s_sub_i32 s4, s1, s6 6099; GFX9-NEXT: s_cmp_ge_u32 s1, s6 6100; GFX9-NEXT: s_cselect_b32 s1, s4, s1 6101; GFX9-NEXT: v_mov_b32_e32 v0, s0 6102; GFX9-NEXT: v_mov_b32_e32 v1, s1 6103; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6104; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6105; GFX9-NEXT: s_endpgm 6106 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6107 %r = urem <2 x i32> %x, %shl.y 6108 store <2 x i32> %r, ptr addrspace(1) %out 6109 ret void 6110} 6111 6112define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { 6113; CHECK-LABEL: @sdiv_i32_oddk_denom( 6114; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 6115; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 6116; CHECK-NEXT: ret void 6117; 6118; GFX6-LABEL: sdiv_i32_oddk_denom: 6119; GFX6: ; %bb.0: 6120; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 6121; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6122; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6123; GFX6-NEXT: s_mov_b32 s3, 0xf000 6124; GFX6-NEXT: s_mov_b32 s2, -1 6125; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6126; GFX6-NEXT: v_mul_hi_i32 v0, s6, v0 6127; GFX6-NEXT: v_add_i32_e32 v0, vcc, s6, v0 6128; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6129; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6130; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6131; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6132; GFX6-NEXT: s_endpgm 6133; 6134; GFX9-LABEL: sdiv_i32_oddk_denom: 6135; GFX9: ; %bb.0: 6136; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 6137; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6138; GFX9-NEXT: v_mov_b32_e32 v0, 0 6139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6140; GFX9-NEXT: s_mul_hi_i32 s3, s2, 0xd9528441 6141; GFX9-NEXT: s_add_i32 s3, s3, s2 6142; GFX9-NEXT: s_lshr_b32 s2, s3, 31 6143; GFX9-NEXT: s_ashr_i32 s3, s3, 20 6144; 
GFX9-NEXT: s_add_i32 s2, s3, s2 6145; GFX9-NEXT: v_mov_b32_e32 v1, s2 6146; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 6147; GFX9-NEXT: s_endpgm 6148 %r = sdiv i32 %x, 1235195 6149 store i32 %r, ptr addrspace(1) %out 6150 ret void 6151} 6152 6153define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { 6154; CHECK-LABEL: @sdiv_i32_pow2k_denom( 6155; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 6156; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 6157; CHECK-NEXT: ret void 6158; 6159; GFX6-LABEL: sdiv_i32_pow2k_denom: 6160; GFX6: ; %bb.0: 6161; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 6162; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6163; GFX6-NEXT: s_mov_b32 s3, 0xf000 6164; GFX6-NEXT: s_mov_b32 s2, -1 6165; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6166; GFX6-NEXT: s_ashr_i32 s4, s6, 31 6167; GFX6-NEXT: s_lshr_b32 s4, s4, 20 6168; GFX6-NEXT: s_add_i32 s6, s6, s4 6169; GFX6-NEXT: s_ashr_i32 s4, s6, 12 6170; GFX6-NEXT: v_mov_b32_e32 v0, s4 6171; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6172; GFX6-NEXT: s_endpgm 6173; 6174; GFX9-LABEL: sdiv_i32_pow2k_denom: 6175; GFX9: ; %bb.0: 6176; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 6177; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6178; GFX9-NEXT: v_mov_b32_e32 v0, 0 6179; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6180; GFX9-NEXT: s_ashr_i32 s3, s2, 31 6181; GFX9-NEXT: s_lshr_b32 s3, s3, 20 6182; GFX9-NEXT: s_add_i32 s2, s2, s3 6183; GFX9-NEXT: s_ashr_i32 s2, s2, 12 6184; GFX9-NEXT: v_mov_b32_e32 v1, s2 6185; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 6186; GFX9-NEXT: s_endpgm 6187 %r = sdiv i32 %x, 4096 6188 store i32 %r, ptr addrspace(1) %out 6189 ret void 6190} 6191 6192define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) { 6193; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 6194; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6195; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 6196; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) 
[[OUT:%.*]], align 4 6197; CHECK-NEXT: ret void 6198; 6199; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 6200; GFX6: ; %bb.0: 6201; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6202; GFX6-NEXT: s_mov_b32 s7, 0xf000 6203; GFX6-NEXT: s_mov_b32 s6, -1 6204; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6205; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6206; GFX6-NEXT: s_ashr_i32 s8, s3, 31 6207; GFX6-NEXT: s_add_i32 s3, s3, s8 6208; GFX6-NEXT: s_xor_b32 s3, s3, s8 6209; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 6210; GFX6-NEXT: s_sub_i32 s4, 0, s3 6211; GFX6-NEXT: s_ashr_i32 s9, s2, 31 6212; GFX6-NEXT: s_add_i32 s2, s2, s9 6213; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6214; GFX6-NEXT: s_xor_b32 s2, s2, s9 6215; GFX6-NEXT: s_mov_b32 s5, s1 6216; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6217; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6218; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 6219; GFX6-NEXT: s_mov_b32 s4, s0 6220; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6221; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6222; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 6223; GFX6-NEXT: v_readfirstlane_b32 s0, v0 6224; GFX6-NEXT: s_mul_i32 s0, s0, s3 6225; GFX6-NEXT: s_sub_i32 s0, s2, s0 6226; GFX6-NEXT: s_sub_i32 s1, s0, s3 6227; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 6228; GFX6-NEXT: s_cmp_ge_u32 s0, s3 6229; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 6230; GFX6-NEXT: s_cselect_b32 s0, s1, s0 6231; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6232; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 6233; GFX6-NEXT: s_cmp_ge_u32 s0, s3 6234; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 6235; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6236; GFX6-NEXT: s_xor_b32 s0, s9, s8 6237; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 6238; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 6239; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6240; GFX6-NEXT: s_endpgm 6241; 6242; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 6243; GFX9: ; %bb.0: 6244; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6245; GFX9-NEXT: v_mov_b32_e32 v1, 0 6246; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6247; GFX9-NEXT: s_lshl_b32 s3, 
0x1000, s3 6248; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6249; GFX9-NEXT: s_add_i32 s3, s3, s4 6250; GFX9-NEXT: s_xor_b32 s3, s3, s4 6251; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6252; GFX9-NEXT: s_sub_i32 s6, 0, s3 6253; GFX9-NEXT: s_ashr_i32 s5, s2, 31 6254; GFX9-NEXT: s_add_i32 s2, s2, s5 6255; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6256; GFX9-NEXT: s_xor_b32 s2, s2, s5 6257; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6258; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6259; GFX9-NEXT: v_readfirstlane_b32 s7, v0 6260; GFX9-NEXT: s_mul_i32 s6, s6, s7 6261; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 6262; GFX9-NEXT: s_add_i32 s7, s7, s6 6263; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 6264; GFX9-NEXT: s_mul_i32 s8, s6, s3 6265; GFX9-NEXT: s_sub_i32 s2, s2, s8 6266; GFX9-NEXT: s_add_i32 s7, s6, 1 6267; GFX9-NEXT: s_sub_i32 s8, s2, s3 6268; GFX9-NEXT: s_cmp_ge_u32 s2, s3 6269; GFX9-NEXT: s_cselect_b32 s6, s7, s6 6270; GFX9-NEXT: s_cselect_b32 s2, s8, s2 6271; GFX9-NEXT: s_add_i32 s7, s6, 1 6272; GFX9-NEXT: s_cmp_ge_u32 s2, s3 6273; GFX9-NEXT: s_cselect_b32 s2, s7, s6 6274; GFX9-NEXT: s_xor_b32 s3, s5, s4 6275; GFX9-NEXT: s_xor_b32 s2, s2, s3 6276; GFX9-NEXT: s_sub_i32 s2, s2, s3 6277; GFX9-NEXT: v_mov_b32_e32 v0, s2 6278; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 6279; GFX9-NEXT: s_endpgm 6280 %shl.y = shl i32 4096, %y 6281 %r = sdiv i32 %x, %shl.y 6282 store i32 %r, ptr addrspace(1) %out 6283 ret void 6284} 6285 6286define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) { 6287; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 6288; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6289; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6290; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 6291; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6292; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 6293; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6294; CHECK-NEXT: store <2 x 
i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8 6295; CHECK-NEXT: ret void 6296; 6297; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 6298; GFX6: ; %bb.0: 6299; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6300; GFX6-NEXT: s_mov_b32 s7, 0xf000 6301; GFX6-NEXT: s_mov_b32 s6, -1 6302; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6303; GFX6-NEXT: s_mov_b32 s4, s0 6304; GFX6-NEXT: s_mov_b32 s5, s1 6305; GFX6-NEXT: s_ashr_i32 s0, s2, 31 6306; GFX6-NEXT: s_ashr_i32 s1, s3, 31 6307; GFX6-NEXT: s_lshr_b32 s0, s0, 20 6308; GFX6-NEXT: s_lshr_b32 s1, s1, 20 6309; GFX6-NEXT: s_add_i32 s0, s2, s0 6310; GFX6-NEXT: s_add_i32 s1, s3, s1 6311; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6312; GFX6-NEXT: s_ashr_i32 s1, s1, 12 6313; GFX6-NEXT: v_mov_b32_e32 v0, s0 6314; GFX6-NEXT: v_mov_b32_e32 v1, s1 6315; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6316; GFX6-NEXT: s_endpgm 6317; 6318; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 6319; GFX9: ; %bb.0: 6320; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6321; GFX9-NEXT: v_mov_b32_e32 v2, 0 6322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6323; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6324; GFX9-NEXT: s_ashr_i32 s5, s3, 31 6325; GFX9-NEXT: s_lshr_b32 s4, s4, 20 6326; GFX9-NEXT: s_lshr_b32 s5, s5, 20 6327; GFX9-NEXT: s_add_i32 s2, s2, s4 6328; GFX9-NEXT: s_add_i32 s3, s3, s5 6329; GFX9-NEXT: s_ashr_i32 s2, s2, 12 6330; GFX9-NEXT: s_ashr_i32 s3, s3, 12 6331; GFX9-NEXT: v_mov_b32_e32 v0, s2 6332; GFX9-NEXT: v_mov_b32_e32 v1, s3 6333; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6334; GFX9-NEXT: s_endpgm 6335 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 6336 store <2 x i32> %r, ptr addrspace(1) %out 6337 ret void 6338} 6339 6340define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) { 6341; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 6342; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6343; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6344; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 
[[TMP2]], i64 0 6345; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6346; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 6347; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6348; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8 6349; CHECK-NEXT: ret void 6350; 6351; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6352; GFX6: ; %bb.0: 6353; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6354; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 6355; GFX6-NEXT: s_mov_b32 s7, 0xf000 6356; GFX6-NEXT: s_mov_b32 s6, -1 6357; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6358; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 6359; GFX6-NEXT: s_mov_b32 s4, s0 6360; GFX6-NEXT: s_ashr_i32 s0, s2, 31 6361; GFX6-NEXT: s_lshr_b32 s0, s0, 20 6362; GFX6-NEXT: s_add_i32 s0, s2, s0 6363; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 6364; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6365; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6366; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 6367; GFX6-NEXT: s_mov_b32 s5, s1 6368; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 6369; GFX6-NEXT: v_mov_b32_e32 v0, s0 6370; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6371; GFX6-NEXT: s_endpgm 6372; 6373; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6374; GFX9: ; %bb.0: 6375; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6376; GFX9-NEXT: v_mov_b32_e32 v2, 0 6377; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6378; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6379; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 6380; GFX9-NEXT: s_lshr_b32 s4, s4, 20 6381; GFX9-NEXT: s_add_i32 s5, s5, s3 6382; GFX9-NEXT: s_add_i32 s2, s2, s4 6383; GFX9-NEXT: s_lshr_b32 s3, s5, 31 6384; GFX9-NEXT: s_ashr_i32 s4, s5, 11 6385; GFX9-NEXT: s_ashr_i32 s2, s2, 12 6386; GFX9-NEXT: s_add_i32 s4, s4, s3 6387; GFX9-NEXT: v_mov_b32_e32 v0, s2 6388; GFX9-NEXT: v_mov_b32_e32 v1, s4 6389; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6390; GFX9-NEXT: s_endpgm 6391 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 6392 store <2 x i32> %r, ptr 
addrspace(1) %out 6393 ret void 6394} 6395 6396define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) { 6397; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 6398; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]] 6399; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6400; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6401; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6402; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6403; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6404; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 6405; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 6406; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 6407; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 6408; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 6409; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 6410; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 6411; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 6412; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 6413; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 6414; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 6415; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 6416; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 6417; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 6418; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 6419; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 6420; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 6421; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 6422; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 6423; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 6424; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 6425; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 6426; CHECK-NEXT: 
[[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 6427; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 6428; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 6429; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 6430; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 6431; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 6432; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 6433; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 6434; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 6435; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 6436; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 6437; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 6438; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 6439; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> poison, i32 [[TMP40]], i64 0 6440; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 6441; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6442; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 6443; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 6444; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 6445; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 6446; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 6447; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 6448; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 6449; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 6450; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 6451; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 6452; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 6453; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 6454; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 6455; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 6456; 
CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 6457; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 6458; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 6459; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 6460; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 6461; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 6462; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 6463; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 6464; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 6465; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 6466; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 6467; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 6468; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 6469; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 6470; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 6471; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 6472; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 6473; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 6474; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 6475; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 6476; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 6477; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 6478; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 6479; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 6480; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 6481; CHECK-NEXT: store <2 x i32> [[TMP82]], ptr addrspace(1) [[OUT:%.*]], align 8 6482; CHECK-NEXT: ret void 6483; 6484; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 6485; GFX6: ; %bb.0: 6486; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 6487; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 6488; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6489; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 6490; 
GFX6-NEXT: s_abs_i32 s6, s2 6491; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 6492; GFX6-NEXT: s_sub_i32 s7, 0, s6 6493; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6494; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6495; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6496; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6497; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 6498; GFX6-NEXT: s_abs_i32 s7, s0 6499; GFX6-NEXT: s_xor_b32 s0, s0, s2 6500; GFX6-NEXT: s_ashr_i32 s0, s0, 31 6501; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6502; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6503; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 6504; GFX6-NEXT: v_readfirstlane_b32 s2, v0 6505; GFX6-NEXT: s_mul_i32 s2, s2, s6 6506; GFX6-NEXT: s_sub_i32 s2, s7, s2 6507; GFX6-NEXT: s_sub_i32 s7, s2, s6 6508; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 6509; GFX6-NEXT: s_cmp_ge_u32 s2, s6 6510; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 6511; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6512; GFX6-NEXT: s_cselect_b32 s2, s7, s2 6513; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 6514; GFX6-NEXT: s_cmp_ge_u32 s2, s6 6515; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 6516; GFX6-NEXT: s_abs_i32 s2, s3 6517; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 6518; GFX6-NEXT: s_sub_i32 s6, 0, s2 6519; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6520; GFX6-NEXT: s_xor_b32 s3, s1, s3 6521; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 6522; GFX6-NEXT: s_abs_i32 s1, s1 6523; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 6524; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 6525; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 6526; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 6527; GFX6-NEXT: s_ashr_i32 s3, s3, 31 6528; GFX6-NEXT: s_mov_b32 s7, 0xf000 6529; GFX6-NEXT: v_mul_lo_u32 v3, s6, v2 6530; GFX6-NEXT: s_mov_b32 s6, -1 6531; GFX6-NEXT: v_mul_hi_u32 v1, v2, v3 6532; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6533; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6534; GFX6-NEXT: v_readfirstlane_b32 s0, v1 6535; GFX6-NEXT: s_mul_i32 s0, s0, s2 6536; GFX6-NEXT: s_sub_i32 s0, s1, s0 6537; GFX6-NEXT: s_sub_i32 s1, s0, s2 6538; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, 1, v1 6539; GFX6-NEXT: s_cmp_ge_u32 s0, s2 6540; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 6541; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6542; GFX6-NEXT: s_cselect_b32 s0, s1, s0 6543; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v1 6544; GFX6-NEXT: s_cmp_ge_u32 s0, s2 6545; GFX6-NEXT: s_cselect_b64 vcc, -1, 0 6546; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6547; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 6548; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v1 6549; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6550; GFX6-NEXT: s_endpgm 6551; 6552; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 6553; GFX9: ; %bb.0: 6554; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6555; GFX9-NEXT: v_mov_b32_e32 v2, 0 6556; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6557; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 6558; GFX9-NEXT: s_abs_i32 s6, s2 6559; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 6560; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 6561; GFX9-NEXT: s_abs_i32 s3, s0 6562; GFX9-NEXT: s_xor_b32 s0, s0, s2 6563; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6564; GFX9-NEXT: s_sub_i32 s2, 0, s6 6565; GFX9-NEXT: s_ashr_i32 s0, s0, 31 6566; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6567; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6568; GFX9-NEXT: v_readfirstlane_b32 s8, v0 6569; GFX9-NEXT: s_mul_i32 s2, s2, s8 6570; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2 6571; GFX9-NEXT: s_add_i32 s8, s8, s2 6572; GFX9-NEXT: s_mul_hi_u32 s2, s3, s8 6573; GFX9-NEXT: s_mul_i32 s8, s2, s6 6574; GFX9-NEXT: s_sub_i32 s3, s3, s8 6575; GFX9-NEXT: s_add_i32 s9, s2, 1 6576; GFX9-NEXT: s_sub_i32 s8, s3, s6 6577; GFX9-NEXT: s_cmp_ge_u32 s3, s6 6578; GFX9-NEXT: s_cselect_b32 s2, s9, s2 6579; GFX9-NEXT: s_cselect_b32 s3, s8, s3 6580; GFX9-NEXT: s_add_i32 s8, s2, 1 6581; GFX9-NEXT: s_cmp_ge_u32 s3, s6 6582; GFX9-NEXT: s_cselect_b32 s6, s8, s2 6583; GFX9-NEXT: s_abs_i32 s8, s7 6584; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 6585; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 6586; GFX9-NEXT: s_xor_b32 s5, s6, s0 6587; GFX9-NEXT: s_sub_i32 s6, 0, s8 
6588; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6589; GFX9-NEXT: s_sub_i32 s0, s5, s0 6590; GFX9-NEXT: s_xor_b32 s4, s1, s7 6591; GFX9-NEXT: s_abs_i32 s1, s1 6592; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6593; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6594; GFX9-NEXT: s_ashr_i32 s4, s4, 31 6595; GFX9-NEXT: v_readfirstlane_b32 s5, v0 6596; GFX9-NEXT: s_mul_i32 s6, s6, s5 6597; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6 6598; GFX9-NEXT: s_add_i32 s5, s5, s6 6599; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5 6600; GFX9-NEXT: s_mul_i32 s6, s5, s8 6601; GFX9-NEXT: s_sub_i32 s1, s1, s6 6602; GFX9-NEXT: s_add_i32 s7, s5, 1 6603; GFX9-NEXT: s_sub_i32 s6, s1, s8 6604; GFX9-NEXT: s_cmp_ge_u32 s1, s8 6605; GFX9-NEXT: s_cselect_b32 s5, s7, s5 6606; GFX9-NEXT: s_cselect_b32 s1, s6, s1 6607; GFX9-NEXT: s_add_i32 s6, s5, 1 6608; GFX9-NEXT: s_cmp_ge_u32 s1, s8 6609; GFX9-NEXT: s_cselect_b32 s1, s6, s5 6610; GFX9-NEXT: s_xor_b32 s1, s1, s4 6611; GFX9-NEXT: s_sub_i32 s1, s1, s4 6612; GFX9-NEXT: v_mov_b32_e32 v0, s0 6613; GFX9-NEXT: v_mov_b32_e32 v1, s1 6614; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6615; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6616; GFX9-NEXT: s_endpgm 6617 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6618 %r = sdiv <2 x i32> %x, %shl.y 6619 store <2 x i32> %r, ptr addrspace(1) %out 6620 ret void 6621} 6622 6623define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) { 6624; CHECK-LABEL: @srem_i32_oddk_denom( 6625; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 6626; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 6627; CHECK-NEXT: ret void 6628; 6629; GFX6-LABEL: srem_i32_oddk_denom: 6630; GFX6: ; %bb.0: 6631; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 6632; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6633; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6634; GFX6-NEXT: s_mov_b32 s3, 0xf000 6635; GFX6-NEXT: s_mov_b32 s2, -1 6636; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6637; GFX6-NEXT: v_mul_hi_i32 v0, s6, v0 6638; GFX6-NEXT: v_readfirstlane_b32 s4, v0 
6639; GFX6-NEXT: s_add_i32 s4, s4, s6 6640; GFX6-NEXT: s_lshr_b32 s5, s4, 31 6641; GFX6-NEXT: s_ashr_i32 s4, s4, 20 6642; GFX6-NEXT: s_add_i32 s4, s4, s5 6643; GFX6-NEXT: s_mul_i32 s4, s4, 0x12d8fb 6644; GFX6-NEXT: s_sub_i32 s4, s6, s4 6645; GFX6-NEXT: v_mov_b32_e32 v0, s4 6646; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6647; GFX6-NEXT: s_endpgm 6648; 6649; GFX9-LABEL: srem_i32_oddk_denom: 6650; GFX9: ; %bb.0: 6651; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 6652; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6653; GFX9-NEXT: v_mov_b32_e32 v0, 0 6654; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6655; GFX9-NEXT: s_mul_hi_i32 s3, s2, 0xd9528441 6656; GFX9-NEXT: s_add_i32 s3, s3, s2 6657; GFX9-NEXT: s_lshr_b32 s4, s3, 31 6658; GFX9-NEXT: s_ashr_i32 s3, s3, 20 6659; GFX9-NEXT: s_add_i32 s3, s3, s4 6660; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb 6661; GFX9-NEXT: s_sub_i32 s2, s2, s3 6662; GFX9-NEXT: v_mov_b32_e32 v1, s2 6663; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 6664; GFX9-NEXT: s_endpgm 6665 %r = srem i32 %x, 1235195 6666 store i32 %r, ptr addrspace(1) %out 6667 ret void 6668} 6669 6670define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) { 6671; CHECK-LABEL: @srem_i32_pow2k_denom( 6672; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 6673; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 6674; CHECK-NEXT: ret void 6675; 6676; GFX6-LABEL: srem_i32_pow2k_denom: 6677; GFX6: ; %bb.0: 6678; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb 6679; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 6680; GFX6-NEXT: s_mov_b32 s3, 0xf000 6681; GFX6-NEXT: s_mov_b32 s2, -1 6682; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6683; GFX6-NEXT: s_ashr_i32 s4, s6, 31 6684; GFX6-NEXT: s_lshr_b32 s4, s4, 20 6685; GFX6-NEXT: s_add_i32 s4, s6, s4 6686; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 6687; GFX6-NEXT: s_sub_i32 s4, s6, s4 6688; GFX6-NEXT: v_mov_b32_e32 v0, s4 6689; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6690; GFX6-NEXT: s_endpgm 6691; 6692; GFX9-LABEL: 
srem_i32_pow2k_denom: 6693; GFX9: ; %bb.0: 6694; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c 6695; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 6696; GFX9-NEXT: v_mov_b32_e32 v0, 0 6697; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6698; GFX9-NEXT: s_ashr_i32 s3, s2, 31 6699; GFX9-NEXT: s_lshr_b32 s3, s3, 20 6700; GFX9-NEXT: s_add_i32 s3, s2, s3 6701; GFX9-NEXT: s_and_b32 s3, s3, 0xfffff000 6702; GFX9-NEXT: s_sub_i32 s2, s2, s3 6703; GFX9-NEXT: v_mov_b32_e32 v1, s2 6704; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 6705; GFX9-NEXT: s_endpgm 6706 %r = srem i32 %x, 4096 6707 store i32 %r, ptr addrspace(1) %out 6708 ret void 6709} 6710 6711define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x, i32 %y) { 6712; CHECK-LABEL: @srem_i32_pow2_shl_denom( 6713; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6714; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 6715; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4 6716; CHECK-NEXT: ret void 6717; 6718; GFX6-LABEL: srem_i32_pow2_shl_denom: 6719; GFX6: ; %bb.0: 6720; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6721; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6722; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6723; GFX6-NEXT: s_ashr_i32 s4, s3, 31 6724; GFX6-NEXT: s_add_i32 s3, s3, s4 6725; GFX6-NEXT: s_xor_b32 s4, s3, s4 6726; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 6727; GFX6-NEXT: s_sub_i32 s3, 0, s4 6728; GFX6-NEXT: s_ashr_i32 s5, s2, 31 6729; GFX6-NEXT: s_add_i32 s2, s2, s5 6730; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6731; GFX6-NEXT: s_xor_b32 s6, s2, s5 6732; GFX6-NEXT: s_mov_b32 s2, -1 6733; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6734; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6735; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 6736; GFX6-NEXT: s_mov_b32 s3, 0xf000 6737; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6738; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6739; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 6740; GFX6-NEXT: v_readfirstlane_b32 s7, v0 6741; GFX6-NEXT: s_mul_i32 s7, s7, s4 6742; GFX6-NEXT: s_sub_i32 s6, s6, s7 
6743; GFX6-NEXT: s_sub_i32 s7, s6, s4 6744; GFX6-NEXT: s_cmp_ge_u32 s6, s4 6745; GFX6-NEXT: s_cselect_b32 s6, s7, s6 6746; GFX6-NEXT: s_sub_i32 s7, s6, s4 6747; GFX6-NEXT: s_cmp_ge_u32 s6, s4 6748; GFX6-NEXT: s_cselect_b32 s4, s7, s6 6749; GFX6-NEXT: s_xor_b32 s4, s4, s5 6750; GFX6-NEXT: s_sub_i32 s4, s4, s5 6751; GFX6-NEXT: v_mov_b32_e32 v0, s4 6752; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6753; GFX6-NEXT: s_endpgm 6754; 6755; GFX9-LABEL: srem_i32_pow2_shl_denom: 6756; GFX9: ; %bb.0: 6757; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6758; GFX9-NEXT: v_mov_b32_e32 v1, 0 6759; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6760; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6761; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6762; GFX9-NEXT: s_add_i32 s3, s3, s4 6763; GFX9-NEXT: s_xor_b32 s3, s3, s4 6764; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6765; GFX9-NEXT: s_sub_i32 s5, 0, s3 6766; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6767; GFX9-NEXT: s_add_i32 s2, s2, s4 6768; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6769; GFX9-NEXT: s_xor_b32 s2, s2, s4 6770; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6771; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6772; GFX9-NEXT: v_readfirstlane_b32 s6, v0 6773; GFX9-NEXT: s_mul_i32 s5, s5, s6 6774; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 6775; GFX9-NEXT: s_add_i32 s6, s6, s5 6776; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 6777; GFX9-NEXT: s_mul_i32 s5, s5, s3 6778; GFX9-NEXT: s_sub_i32 s2, s2, s5 6779; GFX9-NEXT: s_sub_i32 s5, s2, s3 6780; GFX9-NEXT: s_cmp_ge_u32 s2, s3 6781; GFX9-NEXT: s_cselect_b32 s2, s5, s2 6782; GFX9-NEXT: s_sub_i32 s5, s2, s3 6783; GFX9-NEXT: s_cmp_ge_u32 s2, s3 6784; GFX9-NEXT: s_cselect_b32 s2, s5, s2 6785; GFX9-NEXT: s_xor_b32 s2, s2, s4 6786; GFX9-NEXT: s_sub_i32 s2, s2, s4 6787; GFX9-NEXT: v_mov_b32_e32 v0, s2 6788; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 6789; GFX9-NEXT: s_endpgm 6790 %shl.y = shl i32 4096, %y 6791 %r = srem i32 %x, %shl.y 6792 store i32 %r, ptr addrspace(1) %out 6793 ret void 6794} 6795 6796define amdgpu_kernel void 
@srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i32> %x) { 6797; CHECK-LABEL: @srem_v2i32_pow2k_denom( 6798; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6799; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 6800; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i64 0 6801; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6802; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 6803; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6804; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 8 6805; CHECK-NEXT: ret void 6806; 6807; GFX6-LABEL: srem_v2i32_pow2k_denom: 6808; GFX6: ; %bb.0: 6809; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 6810; GFX6-NEXT: s_mov_b32 s7, 0xf000 6811; GFX6-NEXT: s_mov_b32 s6, -1 6812; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6813; GFX6-NEXT: s_mov_b32 s4, s0 6814; GFX6-NEXT: s_mov_b32 s5, s1 6815; GFX6-NEXT: s_ashr_i32 s0, s2, 31 6816; GFX6-NEXT: s_ashr_i32 s1, s3, 31 6817; GFX6-NEXT: s_lshr_b32 s0, s0, 20 6818; GFX6-NEXT: s_lshr_b32 s1, s1, 20 6819; GFX6-NEXT: s_add_i32 s0, s2, s0 6820; GFX6-NEXT: s_add_i32 s1, s3, s1 6821; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 6822; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 6823; GFX6-NEXT: s_sub_i32 s0, s2, s0 6824; GFX6-NEXT: s_sub_i32 s1, s3, s1 6825; GFX6-NEXT: v_mov_b32_e32 v0, s0 6826; GFX6-NEXT: v_mov_b32_e32 v1, s1 6827; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6828; GFX6-NEXT: s_endpgm 6829; 6830; GFX9-LABEL: srem_v2i32_pow2k_denom: 6831; GFX9: ; %bb.0: 6832; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 6833; GFX9-NEXT: v_mov_b32_e32 v2, 0 6834; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6835; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6836; GFX9-NEXT: s_ashr_i32 s5, s3, 31 6837; GFX9-NEXT: s_lshr_b32 s4, s4, 20 6838; GFX9-NEXT: s_lshr_b32 s5, s5, 20 6839; GFX9-NEXT: s_add_i32 s4, s2, s4 6840; GFX9-NEXT: s_add_i32 s5, s3, s5 6841; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 6842; 
GFX9-NEXT: s_sub_i32 s2, s2, s4 6843; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 6844; GFX9-NEXT: s_sub_i32 s3, s3, s4 6845; GFX9-NEXT: v_mov_b32_e32 v0, s2 6846; GFX9-NEXT: v_mov_b32_e32 v1, s3 6847; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6848; GFX9-NEXT: s_endpgm 6849 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 6850 store <2 x i32> %r, ptr addrspace(1) %out 6851 ret void 6852} 6853 6854define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x i32> %x, <2 x i32> %y) { 6855; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 6856; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> splat (i32 4096), [[Y:%.*]] 6857; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6858; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6859; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6860; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6861; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 6862; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 6863; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 6864; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 6865; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 6866; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6867; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 6868; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 6869; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 6870; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 6871; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 6872; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 6873; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 6874; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 6875; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 6876; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 6877; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 6878; CHECK-NEXT: [[TMP22:%.*]] = 
zext i32 [[TMP7]] to i64 6879; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 6880; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 6881; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 6882; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 6883; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 6884; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 6885; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 6886; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 6887; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 6888; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 6889; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 6890; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 6891; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 6892; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 6893; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 6894; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> poison, i32 [[TMP37]], i64 0 6895; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 6896; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6897; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 6898; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 6899; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 6900; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 6901; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 6902; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 6903; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 6904; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 6905; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 6906; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 6907; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 6908; CHECK-NEXT: [[TMP52:%.*]] = mul i32 
[[TMP51]], [[TMP50]] 6909; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 6910; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 6911; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 6912; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 6913; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 6914; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 6915; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 6916; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 6917; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 6918; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 6919; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 6920; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 6921; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 6922; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 6923; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 6924; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 6925; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 6926; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 6927; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 6928; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 6929; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 6930; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 6931; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 6932; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 6933; CHECK-NEXT: store <2 x i32> [[TMP76]], ptr addrspace(1) [[OUT:%.*]], align 8 6934; CHECK-NEXT: ret void 6935; 6936; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 6937; GFX6: ; %bb.0: 6938; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb 6939; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 6940; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6941; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s2 6942; GFX6-NEXT: s_abs_i32 s2, s2 6943; GFX6-NEXT: 
v_cvt_f32_u32_e32 v0, s2 6944; GFX6-NEXT: s_sub_i32 s6, 0, s2 6945; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6946; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6947; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6948; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6949; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 6950; GFX6-NEXT: s_abs_i32 s6, s0 6951; GFX6-NEXT: s_ashr_i32 s0, s0, 31 6952; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6953; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6954; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 6955; GFX6-NEXT: v_readfirstlane_b32 s7, v0 6956; GFX6-NEXT: s_mul_i32 s7, s7, s2 6957; GFX6-NEXT: s_sub_i32 s6, s6, s7 6958; GFX6-NEXT: s_sub_i32 s7, s6, s2 6959; GFX6-NEXT: s_cmp_ge_u32 s6, s2 6960; GFX6-NEXT: s_cselect_b32 s6, s7, s6 6961; GFX6-NEXT: s_sub_i32 s7, s6, s2 6962; GFX6-NEXT: s_cmp_ge_u32 s6, s2 6963; GFX6-NEXT: s_cselect_b32 s2, s7, s6 6964; GFX6-NEXT: s_abs_i32 s3, s3 6965; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 6966; GFX6-NEXT: s_sub_i32 s6, 0, s3 6967; GFX6-NEXT: s_abs_i32 s8, s1 6968; GFX6-NEXT: s_xor_b32 s2, s2, s0 6969; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6970; GFX6-NEXT: s_sub_i32 s0, s2, s0 6971; GFX6-NEXT: s_ashr_i32 s1, s1, 31 6972; GFX6-NEXT: s_mov_b32 s7, 0xf000 6973; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6974; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6975; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 6976; GFX6-NEXT: s_mov_b32 s6, -1 6977; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6978; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6979; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 6980; GFX6-NEXT: v_readfirstlane_b32 s2, v0 6981; GFX6-NEXT: s_mul_i32 s2, s2, s3 6982; GFX6-NEXT: s_sub_i32 s2, s8, s2 6983; GFX6-NEXT: s_sub_i32 s8, s2, s3 6984; GFX6-NEXT: s_cmp_ge_u32 s2, s3 6985; GFX6-NEXT: s_cselect_b32 s2, s8, s2 6986; GFX6-NEXT: s_sub_i32 s8, s2, s3 6987; GFX6-NEXT: s_cmp_ge_u32 s2, s3 6988; GFX6-NEXT: s_cselect_b32 s2, s8, s2 6989; GFX6-NEXT: s_xor_b32 s2, s2, s1 6990; GFX6-NEXT: s_sub_i32 s1, s2, s1 6991; GFX6-NEXT: v_mov_b32_e32 v0, s0 6992; GFX6-NEXT: v_mov_b32_e32 v1, s1 6993; 
GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6994; GFX6-NEXT: s_endpgm 6995; 6996; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 6997; GFX9: ; %bb.0: 6998; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c 6999; GFX9-NEXT: v_mov_b32_e32 v2, 0 7000; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7001; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s2 7002; GFX9-NEXT: s_abs_i32 s2, s2 7003; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 7004; GFX9-NEXT: s_sub_i32 s7, 0, s2 7005; GFX9-NEXT: s_ashr_i32 s6, s0, 31 7006; GFX9-NEXT: s_abs_i32 s0, s0 7007; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7008; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 7009; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 7010; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7011; GFX9-NEXT: v_readfirstlane_b32 s8, v0 7012; GFX9-NEXT: s_mul_i32 s7, s7, s8 7013; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 7014; GFX9-NEXT: s_add_i32 s8, s8, s7 7015; GFX9-NEXT: s_mul_hi_u32 s7, s0, s8 7016; GFX9-NEXT: s_mul_i32 s7, s7, s2 7017; GFX9-NEXT: s_sub_i32 s0, s0, s7 7018; GFX9-NEXT: s_sub_i32 s7, s0, s2 7019; GFX9-NEXT: s_cmp_ge_u32 s0, s2 7020; GFX9-NEXT: s_cselect_b32 s0, s7, s0 7021; GFX9-NEXT: s_sub_i32 s7, s0, s2 7022; GFX9-NEXT: s_cmp_ge_u32 s0, s2 7023; GFX9-NEXT: s_cselect_b32 s0, s7, s0 7024; GFX9-NEXT: s_abs_i32 s7, s3 7025; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 7026; GFX9-NEXT: s_xor_b32 s0, s0, s6 7027; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 7028; GFX9-NEXT: s_sub_i32 s5, 0, s7 7029; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7030; GFX9-NEXT: s_sub_i32 s0, s0, s6 7031; GFX9-NEXT: s_ashr_i32 s4, s1, 31 7032; GFX9-NEXT: s_abs_i32 s1, s1 7033; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 7034; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7035; GFX9-NEXT: v_readfirstlane_b32 s6, v0 7036; GFX9-NEXT: s_mul_i32 s5, s5, s6 7037; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 7038; GFX9-NEXT: s_add_i32 s6, s6, s5 7039; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6 7040; GFX9-NEXT: s_mul_i32 s5, s5, s7 7041; GFX9-NEXT: s_sub_i32 s1, s1, s5 7042; GFX9-NEXT: s_sub_i32 s5, s1, s7 7043; GFX9-NEXT: s_cmp_ge_u32 
s1, s7 7044; GFX9-NEXT: s_cselect_b32 s1, s5, s1 7045; GFX9-NEXT: s_sub_i32 s5, s1, s7 7046; GFX9-NEXT: s_cmp_ge_u32 s1, s7 7047; GFX9-NEXT: s_cselect_b32 s1, s5, s1 7048; GFX9-NEXT: s_xor_b32 s1, s1, s4 7049; GFX9-NEXT: s_sub_i32 s1, s1, s4 7050; GFX9-NEXT: v_mov_b32_e32 v0, s0 7051; GFX9-NEXT: v_mov_b32_e32 v1, s1 7052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7053; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7054; GFX9-NEXT: s_endpgm 7055 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7056 %r = srem <2 x i32> %x, %shl.y 7057 store <2 x i32> %r, ptr addrspace(1) %out 7058 ret void 7059} 7060 7061define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { 7062; CHECK-LABEL: @udiv_i64_oddk_denom( 7063; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 7064; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7065; CHECK-NEXT: ret void 7066; 7067; GFX6-LABEL: udiv_i64_oddk_denom: 7068; GFX6: ; %bb.0: 7069; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7070; GFX6-NEXT: v_mov_b32_e32 v1, 0x64c139ef 7071; GFX6-NEXT: v_mov_b32_e32 v0, 0x38f83e5 7072; GFX6-NEXT: s_mov_b32 s7, 0xf000 7073; GFX6-NEXT: s_mov_b32 s6, -1 7074; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7075; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 7076; GFX6-NEXT: v_mul_hi_u32 v3, s3, v1 7077; GFX6-NEXT: s_mov_b32 s5, s1 7078; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0 7079; GFX6-NEXT: s_mul_i32 s1, s3, 0x64c139ef 7080; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v4 7081; GFX6-NEXT: s_mov_b32 s4, s0 7082; GFX6-NEXT: s_mul_i32 s0, s2, 0x38f83e5 7083; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 7084; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v4 7085; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 7086; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7087; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 7088; GFX6-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc 7089; GFX6-NEXT: s_mul_i32 s0, s3, 0x38f83e5 7090; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v2 7091; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc 7092; GFX6-NEXT: 
v_mov_b32_e32 v1, 0 7093; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v0 7094; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7095; GFX6-NEXT: s_endpgm 7096; 7097; GFX9-LABEL: udiv_i64_oddk_denom: 7098; GFX9: ; %bb.0: 7099; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7100; GFX9-NEXT: v_mov_b32_e32 v1, 0 7101; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7102; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x38f83e5 7103; GFX9-NEXT: s_mul_i32 s5, s2, 0x38f83e5 7104; GFX9-NEXT: s_mul_i32 s7, s3, 0x64c139ef 7105; GFX9-NEXT: s_mul_hi_u32 s2, s2, 0x64c139ef 7106; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0x64c139ef 7107; GFX9-NEXT: s_add_u32 s2, s7, s2 7108; GFX9-NEXT: s_addc_u32 s6, s6, 0 7109; GFX9-NEXT: s_add_u32 s2, s5, s2 7110; GFX9-NEXT: s_addc_u32 s2, s4, 0 7111; GFX9-NEXT: s_add_u32 s2, s6, s2 7112; GFX9-NEXT: s_addc_u32 s4, 0, 0 7113; GFX9-NEXT: s_mul_hi_u32 s5, s3, 0x38f83e5 7114; GFX9-NEXT: s_mul_i32 s3, s3, 0x38f83e5 7115; GFX9-NEXT: s_add_u32 s2, s3, s2 7116; GFX9-NEXT: s_addc_u32 s2, s5, s4 7117; GFX9-NEXT: s_lshr_b32 s2, s2, 2 7118; GFX9-NEXT: v_mov_b32_e32 v0, s2 7119; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 7120; GFX9-NEXT: s_endpgm 7121 %r = udiv i64 %x, 1235195949943 7122 store i64 %r, ptr addrspace(1) %out 7123 ret void 7124} 7125 7126define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { 7127; CHECK-LABEL: @udiv_i64_pow2k_denom( 7128; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 7129; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7130; CHECK-NEXT: ret void 7131; 7132; GFX6-LABEL: udiv_i64_pow2k_denom: 7133; GFX6: ; %bb.0: 7134; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7135; GFX6-NEXT: s_mov_b32 s7, 0xf000 7136; GFX6-NEXT: s_mov_b32 s6, -1 7137; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7138; GFX6-NEXT: s_mov_b32 s4, s0 7139; GFX6-NEXT: s_mov_b32 s5, s1 7140; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 7141; GFX6-NEXT: v_mov_b32_e32 v0, s0 7142; GFX6-NEXT: v_mov_b32_e32 v1, s1 7143; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[4:7], 0 7144; GFX6-NEXT: s_endpgm 7145; 7146; GFX9-LABEL: udiv_i64_pow2k_denom: 7147; GFX9: ; %bb.0: 7148; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7149; GFX9-NEXT: v_mov_b32_e32 v2, 0 7150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7151; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7152; GFX9-NEXT: v_mov_b32_e32 v0, s2 7153; GFX9-NEXT: v_mov_b32_e32 v1, s3 7154; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7155; GFX9-NEXT: s_endpgm 7156 %r = udiv i64 %x, 4096 7157 store i64 %r, ptr addrspace(1) %out 7158 ret void 7159} 7160 7161define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) { 7162; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 7163; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7164; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 7165; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7166; CHECK-NEXT: ret void 7167; 7168; GFX6-LABEL: udiv_i64_pow2_shl_denom: 7169; GFX6: ; %bb.0: 7170; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7171; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd 7172; GFX6-NEXT: s_mov_b32 s7, 0xf000 7173; GFX6-NEXT: s_mov_b32 s6, -1 7174; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7175; GFX6-NEXT: s_mov_b32 s4, s0 7176; GFX6-NEXT: s_add_i32 s8, s8, 12 7177; GFX6-NEXT: s_mov_b32 s5, s1 7178; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 7179; GFX6-NEXT: v_mov_b32_e32 v0, s0 7180; GFX6-NEXT: v_mov_b32_e32 v1, s1 7181; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7182; GFX6-NEXT: s_endpgm 7183; 7184; GFX9-LABEL: udiv_i64_pow2_shl_denom: 7185; GFX9: ; %bb.0: 7186; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 7187; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7188; GFX9-NEXT: v_mov_b32_e32 v2, 0 7189; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7190; GFX9-NEXT: s_add_i32 s6, s6, 12 7191; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 7192; GFX9-NEXT: v_mov_b32_e32 v0, s2 7193; GFX9-NEXT: v_mov_b32_e32 v1, s3 7194; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7195; GFX9-NEXT: s_endpgm 7196 %shl.y = shl i64 
4096, %y 7197 %r = udiv i64 %x, %shl.y 7198 store i64 %r, ptr addrspace(1) %out 7199 ret void 7200} 7201 7202define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) { 7203; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 7204; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7205; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7206; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 7207; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7208; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 7209; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7210; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16 7211; CHECK-NEXT: ret void 7212; 7213; GFX6-LABEL: udiv_v2i64_pow2k_denom: 7214; GFX6: ; %bb.0: 7215; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 7216; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 7217; GFX6-NEXT: s_mov_b32 s7, 0xf000 7218; GFX6-NEXT: s_mov_b32 s6, -1 7219; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7220; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 7221; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7222; GFX6-NEXT: v_mov_b32_e32 v0, s0 7223; GFX6-NEXT: v_mov_b32_e32 v1, s1 7224; GFX6-NEXT: v_mov_b32_e32 v2, s2 7225; GFX6-NEXT: v_mov_b32_e32 v3, s3 7226; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7227; GFX6-NEXT: s_endpgm 7228; 7229; GFX9-LABEL: udiv_v2i64_pow2k_denom: 7230; GFX9: ; %bb.0: 7231; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 7232; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 7233; GFX9-NEXT: v_mov_b32_e32 v4, 0 7234; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7235; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 7236; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7237; GFX9-NEXT: v_mov_b32_e32 v0, s0 7238; GFX9-NEXT: v_mov_b32_e32 v1, s1 7239; GFX9-NEXT: v_mov_b32_e32 v2, s2 7240; GFX9-NEXT: v_mov_b32_e32 v3, s3 7241; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 7242; GFX9-NEXT: s_endpgm 7243 %r = udiv <2 x 
i64> %x, <i64 4096, i64 4096> 7244 store <2 x i64> %r, ptr addrspace(1) %out 7245 ret void 7246} 7247 7248define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) { 7249; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 7250; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7251; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7252; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 7253; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7254; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 7255; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7256; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16 7257; CHECK-NEXT: ret void 7258; 7259; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 7260; GFX6: ; %bb.0: 7261; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd 7262; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7263; GFX6-NEXT: v_mov_b32_e32 v2, 0x10010011 7264; GFX6-NEXT: v_mov_b32_e32 v0, 0x100100 7265; GFX6-NEXT: s_mov_b32 s3, 0xf000 7266; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7267; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 7268; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 7269; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 7270; GFX6-NEXT: s_mul_i32 s7, s11, 0x10010011 7271; GFX6-NEXT: v_add_i32_e32 v3, vcc, s7, v3 7272; GFX6-NEXT: s_mul_i32 s6, s10, 0x100100 7273; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 7274; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3 7275; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7276; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7277; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 7278; GFX6-NEXT: v_addc_u32_e64 v2, s[6:7], 0, 0, vcc 7279; GFX6-NEXT: s_mul_i32 s6, s11, 0x100100 7280; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v1 7281; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc 7282; GFX6-NEXT: v_mov_b32_e32 v1, s11 7283; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v3 7284; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 7285; GFX6-NEXT: 
v_lshr_b64 v[0:1], v[0:1], 1 7286; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], 12 7287; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 7288; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc 7289; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 11 7290; GFX6-NEXT: s_mov_b32 s2, -1 7291; GFX6-NEXT: v_mov_b32_e32 v0, s4 7292; GFX6-NEXT: v_mov_b32_e32 v1, s5 7293; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7294; GFX6-NEXT: s_endpgm 7295; 7296; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 7297; GFX9: ; %bb.0: 7298; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 7299; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 7300; GFX9-NEXT: v_mov_b32_e32 v4, 0 7301; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7302; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 7303; GFX9-NEXT: s_mul_i32 s9, s3, 0x10010011 7304; GFX9-NEXT: s_mul_hi_u32 s10, s2, 0x10010011 7305; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x10010011 7306; GFX9-NEXT: s_add_u32 s9, s9, s10 7307; GFX9-NEXT: s_mul_i32 s5, s2, 0x100100 7308; GFX9-NEXT: s_addc_u32 s8, s8, 0 7309; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x100100 7310; GFX9-NEXT: s_add_u32 s5, s5, s9 7311; GFX9-NEXT: s_addc_u32 s4, s4, 0 7312; GFX9-NEXT: s_add_u32 s4, s8, s4 7313; GFX9-NEXT: s_addc_u32 s5, 0, 0 7314; GFX9-NEXT: s_mul_i32 s9, s3, 0x100100 7315; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x100100 7316; GFX9-NEXT: s_add_u32 s4, s9, s4 7317; GFX9-NEXT: s_addc_u32 s5, s8, s5 7318; GFX9-NEXT: s_sub_u32 s2, s2, s4 7319; GFX9-NEXT: s_subb_u32 s3, s3, s5 7320; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 7321; GFX9-NEXT: s_add_u32 s2, s2, s4 7322; GFX9-NEXT: s_addc_u32 s3, s3, s5 7323; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 11 7324; GFX9-NEXT: v_mov_b32_e32 v0, s0 7325; GFX9-NEXT: v_mov_b32_e32 v1, s1 7326; GFX9-NEXT: v_mov_b32_e32 v2, s2 7327; GFX9-NEXT: v_mov_b32_e32 v3, s3 7328; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 7329; GFX9-NEXT: s_endpgm 7330 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 7331 store <2 x i64> %r, ptr addrspace(1) %out 7332 ret void 7333} 7334 7335define amdgpu_kernel void 
@udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { 7336; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 7337; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]] 7338; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7339; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7340; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 7341; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 7342; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7343; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 7344; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 7345; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7346; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16 7347; CHECK-NEXT: ret void 7348; 7349; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 7350; GFX6: ; %bb.0: 7351; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 7352; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7353; GFX6-NEXT: s_mov_b32 s3, 0xf000 7354; GFX6-NEXT: s_mov_b32 s2, -1 7355; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7356; GFX6-NEXT: s_add_i32 s4, s12, 12 7357; GFX6-NEXT: s_add_i32 s6, s14, 12 7358; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 7359; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 7360; GFX6-NEXT: v_mov_b32_e32 v0, s4 7361; GFX6-NEXT: v_mov_b32_e32 v1, s5 7362; GFX6-NEXT: v_mov_b32_e32 v2, s6 7363; GFX6-NEXT: v_mov_b32_e32 v3, s7 7364; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7365; GFX6-NEXT: s_endpgm 7366; 7367; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 7368; GFX9: ; %bb.0: 7369; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 7370; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7371; GFX9-NEXT: v_mov_b32_e32 v4, 0 7372; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7373; GFX9-NEXT: s_add_i32 s2, s12, 12 7374; GFX9-NEXT: s_add_i32 s4, s14, 12 7375; GFX9-NEXT: s_lshr_b64 s[2:3], s[8:9], s2 7376; GFX9-NEXT: 
s_lshr_b64 s[4:5], s[10:11], s4 7377; GFX9-NEXT: v_mov_b32_e32 v0, s2 7378; GFX9-NEXT: v_mov_b32_e32 v1, s3 7379; GFX9-NEXT: v_mov_b32_e32 v2, s4 7380; GFX9-NEXT: v_mov_b32_e32 v3, s5 7381; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 7382; GFX9-NEXT: s_endpgm 7383 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7384 %r = udiv <2 x i64> %x, %shl.y 7385 store <2 x i64> %r, ptr addrspace(1) %out 7386 ret void 7387} 7388 7389define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { 7390; CHECK-LABEL: @urem_i64_oddk_denom( 7391; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 7392; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7393; CHECK-NEXT: ret void 7394; 7395; GFX6-LABEL: urem_i64_oddk_denom: 7396; GFX6: ; %bb.0: 7397; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7398; GFX6-NEXT: v_mov_b32_e32 v2, 0xf6841139 7399; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e10011 7400; GFX6-NEXT: s_mov_b32 s7, 0xf000 7401; GFX6-NEXT: s_mov_b32 s6, -1 7402; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7403; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 7404; GFX6-NEXT: v_mul_hi_u32 v2, s3, v2 7405; GFX6-NEXT: v_mul_hi_u32 v1, s2, v0 7406; GFX6-NEXT: s_mul_i32 s5, s3, 0xf6841139 7407; GFX6-NEXT: v_add_i32_e32 v3, vcc, s5, v3 7408; GFX6-NEXT: s_mov_b32 s4, s0 7409; GFX6-NEXT: s_mul_i32 s0, s2, 0xe3e10011 7410; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 7411; GFX6-NEXT: v_add_i32_e32 v3, vcc, s0, v3 7412; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 7413; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7414; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7415; GFX6-NEXT: s_mul_i32 s0, s3, 0xe3e10011 7416; GFX6-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc 7417; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 7418; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc 7419; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0 7420; GFX6-NEXT: s_movk_i32 s0, 0x11f 7421; GFX6-NEXT: v_mul_lo_u32 v1, v0, s0 7422; GFX6-NEXT: s_mov_b32 s0, 0x9761f7c9 7423; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 7424; GFX6-NEXT: 
v_mul_lo_u32 v0, v0, s0 7425; GFX6-NEXT: s_mov_b32 s5, s1 7426; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 7427; GFX6-NEXT: v_mov_b32_e32 v2, s3 7428; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 7429; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 7430; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7431; GFX6-NEXT: s_endpgm 7432; 7433; GFX9-LABEL: urem_i64_oddk_denom: 7434; GFX9: ; %bb.0: 7435; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7436; GFX9-NEXT: v_mov_b32_e32 v2, 0 7437; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7438; GFX9-NEXT: s_mul_i32 s7, s3, 0xf6841139 7439; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0xf6841139 7440; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xf6841139 7441; GFX9-NEXT: s_add_u32 s7, s7, s8 7442; GFX9-NEXT: s_mul_i32 s5, s2, 0xe3e10011 7443; GFX9-NEXT: s_addc_u32 s6, s6, 0 7444; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0xe3e10011 7445; GFX9-NEXT: s_add_u32 s5, s5, s7 7446; GFX9-NEXT: s_addc_u32 s4, s4, 0 7447; GFX9-NEXT: s_add_u32 s4, s6, s4 7448; GFX9-NEXT: s_addc_u32 s5, 0, 0 7449; GFX9-NEXT: s_mul_i32 s7, s3, 0xe3e10011 7450; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xe3e10011 7451; GFX9-NEXT: s_add_u32 s4, s7, s4 7452; GFX9-NEXT: s_addc_u32 s4, s6, s5 7453; GFX9-NEXT: s_lshr_b32 s4, s4, 8 7454; GFX9-NEXT: s_mul_i32 s5, s4, 0x11f 7455; GFX9-NEXT: s_mul_hi_u32 s6, s4, 0x9761f7c9 7456; GFX9-NEXT: s_add_i32 s6, s6, s5 7457; GFX9-NEXT: s_mul_i32 s4, s4, 0x9761f7c9 7458; GFX9-NEXT: s_sub_u32 s2, s2, s4 7459; GFX9-NEXT: s_subb_u32 s3, s3, s6 7460; GFX9-NEXT: v_mov_b32_e32 v0, s2 7461; GFX9-NEXT: v_mov_b32_e32 v1, s3 7462; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7463; GFX9-NEXT: s_endpgm 7464 %r = urem i64 %x, 1235195393993 7465 store i64 %r, ptr addrspace(1) %out 7466 ret void 7467} 7468 7469define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { 7470; CHECK-LABEL: @urem_i64_pow2k_denom( 7471; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 7472; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7473; CHECK-NEXT: ret void 
7474; 7475; GFX6-LABEL: urem_i64_pow2k_denom: 7476; GFX6: ; %bb.0: 7477; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7478; GFX6-NEXT: s_mov_b32 s7, 0xf000 7479; GFX6-NEXT: s_mov_b32 s6, -1 7480; GFX6-NEXT: v_mov_b32_e32 v1, 0 7481; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7482; GFX6-NEXT: s_mov_b32 s4, s0 7483; GFX6-NEXT: s_and_b32 s0, s2, 0xfff 7484; GFX6-NEXT: s_mov_b32 s5, s1 7485; GFX6-NEXT: v_mov_b32_e32 v0, s0 7486; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7487; GFX6-NEXT: s_endpgm 7488; 7489; GFX9-LABEL: urem_i64_pow2k_denom: 7490; GFX9: ; %bb.0: 7491; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7492; GFX9-NEXT: v_mov_b32_e32 v1, 0 7493; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7494; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 7495; GFX9-NEXT: v_mov_b32_e32 v0, s2 7496; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 7497; GFX9-NEXT: s_endpgm 7498 %r = urem i64 %x, 4096 7499 store i64 %r, ptr addrspace(1) %out 7500 ret void 7501} 7502 7503define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) { 7504; CHECK-LABEL: @urem_i64_pow2_shl_denom( 7505; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7506; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 7507; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7508; CHECK-NEXT: ret void 7509; 7510; GFX6-LABEL: urem_i64_pow2_shl_denom: 7511; GFX6: ; %bb.0: 7512; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7513; GFX6-NEXT: s_load_dword s8, s[4:5], 0xd 7514; GFX6-NEXT: s_mov_b32 s7, 0xf000 7515; GFX6-NEXT: s_mov_b32 s6, -1 7516; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7517; GFX6-NEXT: s_mov_b32 s4, s0 7518; GFX6-NEXT: s_mov_b32 s5, s1 7519; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s8 7520; GFX6-NEXT: s_add_u32 s0, s0, -1 7521; GFX6-NEXT: s_addc_u32 s1, s1, -1 7522; GFX6-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] 7523; GFX6-NEXT: v_mov_b32_e32 v0, s0 7524; GFX6-NEXT: v_mov_b32_e32 v1, s1 7525; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7526; GFX6-NEXT: s_endpgm 7527; 
7528; GFX9-LABEL: urem_i64_pow2_shl_denom: 7529; GFX9: ; %bb.0: 7530; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 7531; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7532; GFX9-NEXT: v_mov_b32_e32 v2, 0 7533; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7534; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s6 7535; GFX9-NEXT: s_add_u32 s4, s4, -1 7536; GFX9-NEXT: s_addc_u32 s5, s5, -1 7537; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] 7538; GFX9-NEXT: v_mov_b32_e32 v0, s2 7539; GFX9-NEXT: v_mov_b32_e32 v1, s3 7540; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7541; GFX9-NEXT: s_endpgm 7542 %shl.y = shl i64 4096, %y 7543 %r = urem i64 %x, %shl.y 7544 store i64 %r, ptr addrspace(1) %out 7545 ret void 7546} 7547 7548define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) { 7549; CHECK-LABEL: @urem_v2i64_pow2k_denom( 7550; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7551; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 7552; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 7553; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7554; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 7555; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7556; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16 7557; CHECK-NEXT: ret void 7558; 7559; GFX6-LABEL: urem_v2i64_pow2k_denom: 7560; GFX6: ; %bb.0: 7561; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 7562; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 7563; GFX6-NEXT: v_mov_b32_e32 v1, 0 7564; GFX6-NEXT: s_mov_b32 s7, 0xf000 7565; GFX6-NEXT: s_mov_b32 s6, -1 7566; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7567; GFX6-NEXT: s_and_b32 s0, s0, 0xfff 7568; GFX6-NEXT: s_and_b32 s1, s2, 0xfff 7569; GFX6-NEXT: v_mov_b32_e32 v0, s0 7570; GFX6-NEXT: v_mov_b32_e32 v2, s1 7571; GFX6-NEXT: v_mov_b32_e32 v3, v1 7572; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7573; GFX6-NEXT: s_endpgm 7574; 7575; 
GFX9-LABEL: urem_v2i64_pow2k_denom: 7576; GFX9: ; %bb.0: 7577; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 7578; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 7579; GFX9-NEXT: v_mov_b32_e32 v1, 0 7580; GFX9-NEXT: v_mov_b32_e32 v3, v1 7581; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7582; GFX9-NEXT: s_and_b32 s0, s0, 0xfff 7583; GFX9-NEXT: s_and_b32 s1, s2, 0xfff 7584; GFX9-NEXT: v_mov_b32_e32 v0, s0 7585; GFX9-NEXT: v_mov_b32_e32 v2, s1 7586; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[6:7] 7587; GFX9-NEXT: s_endpgm 7588 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 7589 store <2 x i64> %r, ptr addrspace(1) %out 7590 ret void 7591} 7592 7593define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { 7594; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 7595; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]] 7596; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7597; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7598; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 7599; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 7600; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7601; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 7602; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 7603; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7604; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16 7605; CHECK-NEXT: ret void 7606; 7607; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 7608; GFX6: ; %bb.0: 7609; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 7610; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 7611; GFX6-NEXT: s_mov_b32 s3, 0xf000 7612; GFX6-NEXT: s_mov_b32 s2, -1 7613; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7614; GFX6-NEXT: s_lshl_b64 s[4:5], 0x1000, s14 7615; GFX6-NEXT: s_lshl_b64 s[6:7], 0x1000, s12 7616; GFX6-NEXT: s_add_u32 s6, s6, -1 7617; 
GFX6-NEXT: s_addc_u32 s7, s7, -1 7618; GFX6-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7] 7619; GFX6-NEXT: s_add_u32 s4, s4, -1 7620; GFX6-NEXT: s_addc_u32 s5, s5, -1 7621; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] 7622; GFX6-NEXT: v_mov_b32_e32 v0, s6 7623; GFX6-NEXT: v_mov_b32_e32 v1, s7 7624; GFX6-NEXT: v_mov_b32_e32 v2, s4 7625; GFX6-NEXT: v_mov_b32_e32 v3, s5 7626; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7627; GFX6-NEXT: s_endpgm 7628; 7629; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 7630; GFX9: ; %bb.0: 7631; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 7632; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 7633; GFX9-NEXT: v_mov_b32_e32 v4, 0 7634; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7635; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s14 7636; GFX9-NEXT: s_lshl_b64 s[4:5], 0x1000, s12 7637; GFX9-NEXT: s_add_u32 s4, s4, -1 7638; GFX9-NEXT: s_addc_u32 s5, s5, -1 7639; GFX9-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] 7640; GFX9-NEXT: s_add_u32 s2, s2, -1 7641; GFX9-NEXT: s_addc_u32 s3, s3, -1 7642; GFX9-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 7643; GFX9-NEXT: v_mov_b32_e32 v0, s4 7644; GFX9-NEXT: v_mov_b32_e32 v1, s5 7645; GFX9-NEXT: v_mov_b32_e32 v2, s2 7646; GFX9-NEXT: v_mov_b32_e32 v3, s3 7647; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 7648; GFX9-NEXT: s_endpgm 7649 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7650 %r = urem <2 x i64> %x, %shl.y 7651 store <2 x i64> %r, ptr addrspace(1) %out 7652 ret void 7653} 7654 7655define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { 7656; CHECK-LABEL: @sdiv_i64_oddk_denom( 7657; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 7658; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 7659; CHECK-NEXT: ret void 7660; 7661; GFX6-LABEL: sdiv_i64_oddk_denom: 7662; GFX6: ; %bb.0: 7663; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 7664; GFX6-NEXT: v_mov_b32_e32 v2, 0xfd81e19 7665; GFX6-NEXT: v_mov_b32_e32 v0, 0x6ca94220 7666; GFX6-NEXT: s_mov_b32 s7, 0xf000 7667; GFX6-NEXT: 
s_mov_b32 s6, -1 7668; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7669; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 7670; GFX6-NEXT: v_mul_hi_u32 v4, s3, v2 7671; GFX6-NEXT: s_mov_b32 s5, s1 7672; GFX6-NEXT: v_mul_hi_u32 v1, s2, v0 7673; GFX6-NEXT: s_mul_i32 s1, s3, 0xfd81e19 7674; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v3 7675; GFX6-NEXT: s_mov_b32 s4, s0 7676; GFX6-NEXT: s_mul_i32 s0, s2, 0x6ca94220 7677; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 7678; GFX6-NEXT: v_add_i32_e32 v3, vcc, s0, v3 7679; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7680; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 7681; GFX6-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc 7682; GFX6-NEXT: s_ashr_i32 s1, s3, 31 7683; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 7684; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 7685; GFX6-NEXT: s_mul_i32 s0, s3, 0x6ca94220 7686; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 7687; GFX6-NEXT: s_mul_i32 s0, s1, 0x6ca94220 7688; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc 7689; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2 7690; GFX6-NEXT: s_mul_i32 s1, s1, 0xfd81e19 7691; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v0 7692; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v1 7693; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 7694; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 19 7695; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 7696; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 7697; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 7698; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7699; GFX6-NEXT: s_endpgm 7700; 7701; GFX9-LABEL: sdiv_i64_oddk_denom: 7702; GFX9: ; %bb.0: 7703; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 7704; GFX9-NEXT: v_mov_b32_e32 v2, 0 7705; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7706; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x6ca94220 7707; GFX9-NEXT: s_mul_i32 s5, s2, 0x6ca94220 7708; GFX9-NEXT: s_mul_i32 s7, s3, 0xfd81e19 7709; GFX9-NEXT: s_mul_hi_u32 s2, s2, 0xfd81e19 7710; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19 7711; GFX9-NEXT: s_add_u32 s2, s7, s2 7712; GFX9-NEXT: s_addc_u32 s6, s6, 0 7713; GFX9-NEXT: s_add_u32 
s2, s5, s2
; GFX9-NEXT: s_addc_u32 s2, s4, 0
; GFX9-NEXT: s_add_u32 s2, s6, s2
; GFX9-NEXT: s_addc_u32 s4, 0, 0
; GFX9-NEXT: s_mul_i32 s6, s3, 0x6ca94220
; GFX9-NEXT: s_mul_hi_u32 s5, s3, 0x6ca94220
; GFX9-NEXT: s_add_u32 s2, s6, s2
; GFX9-NEXT: s_addc_u32 s4, s5, s4
; GFX9-NEXT: s_ashr_i32 s3, s3, 31
; GFX9-NEXT: s_mul_i32 s5, s3, 0x6ca94220
; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19
; GFX9-NEXT: s_add_i32 s5, s6, s5
; GFX9-NEXT: s_mul_i32 s3, s3, 0xfd81e19
; GFX9-NEXT: s_add_i32 s5, s5, s3
; GFX9-NEXT: s_add_u32 s2, s2, s3
; GFX9-NEXT: s_addc_u32 s3, s4, s5
; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 19
; GFX9-NEXT: s_lshr_b32 s2, s3, 31
; GFX9-NEXT: s_add_u32 s2, s4, s2
; GFX9-NEXT: s_addc_u32 s3, s5, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  %r = sdiv i64 %x, 1235195
  store i64 %r, ptr addrspace(1) %out
  ret void
}

; Signed 64-bit division by the constant 4096 (2^12).
; Expected results: the opt run leaves the sdiv as written, and both llc runs
; select a shift sequence (the sign word shifted right by 20 is added to the
; dividend, then the 64-bit value is arithmetically shifted right by 12)
; rather than a division expansion.
; The assertions are autogenerated (see the note at the top of the file), so
; regenerate them with the update script instead of editing them by hand.
define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
; CHECK-LABEL: @sdiv_i64_pow2k_denom(
; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
; GFX6-LABEL: sdiv_i64_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: s_ashr_i32 s0, s3, 31
; GFX6-NEXT: s_lshr_b32 s0, s0, 20
; GFX6-NEXT: s_add_u32 s0, s2, s0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: s_addc_u32 s1, s3, 0
; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_lshr_b32 s4, s4, 20
; GFX9-NEXT: s_add_u32 s2, s2, s4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
  %r = sdiv i64 %x, 4096
  store i64 %r, ptr addrspace(1) %out
  ret void
}

; Signed 64-bit division whose divisor is 4096 shifted left by the runtime
; argument %y, so the divisor is not a compile-time constant.
; Expected results: the opt run keeps the shl and the sdiv as written; the llc
; expectations are a long expansion that begins by converting the sign-adjusted
; divisor to float and taking a reciprocal (v_cvt_f32_u32 / v_rcp_f32).
define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) {
; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX6-NEXT: s_ashr_i32 s8, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s8
; GFX6-NEXT: s_mov_b32 s9, s8
; GFX6-NEXT: s_addc_u32 s1, s1, s8
; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT: s_sub_u32 s4, 0, s10
; GFX6-NEXT: s_subb_u32 s5, 0, s11
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_ashr_i32 s12, s3, 31
; GFX6-NEXT: s_add_u32 s2, s2, s12
; GFX6-NEXT: s_mov_b32 s13, s12
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT:
v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: s_addc_u32 s3, s3, s12
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13]
; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4
; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2
; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4
; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4
; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0
; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3
; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2
; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3
; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2
; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2
; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc
; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1
; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1
; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0
; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0
; GFX6-NEXT: v_mov_b32_e32 v5, s11
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3
; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4
; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5
; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4
; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
; GFX6-NEXT: v_mov_b32_e32 v6, s3
; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2
; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2
; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9]
; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX6-NEXT: v_mov_b32_e32 v2, s1
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; The expectations below are for the second llc run in the file header
; (-mcpu=gfx900) of the same kernel.
; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
; GFX9-NEXT: s_addc_u32 s1, s1, s2
; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_sub_u32 s0, 0, s6
; GFX9-NEXT: s_subb_u32 s1, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_readfirstlane_b32 s4, v2
; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: s_mul_i32 s12, s0, s4
; GFX9-NEXT: s_mul_hi_u32 s14, s0, s5
; GFX9-NEXT: s_mul_i32 s13, s1, s5
; GFX9-NEXT: s_add_i32 s12, s14, s12
; GFX9-NEXT: s_mul_i32 s15, s0, s5
; GFX9-NEXT: s_add_i32 s12, s12, s13
; GFX9-NEXT: s_mul_hi_u32 s14, s5, s15
; GFX9-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX9-NEXT: s_mul_i32 s5, s5, s12
; GFX9-NEXT: s_add_u32 s5, s14, s5
; GFX9-NEXT: s_addc_u32 s13, 0, s13
; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
; GFX9-NEXT: s_mul_i32 s15, s4, s15
; GFX9-NEXT: s_add_u32 s5, s5, s15
; GFX9-NEXT: s_mul_hi_u32 s14, s4, s12
; GFX9-NEXT: s_addc_u32 s5, s13, s16
; GFX9-NEXT: s_addc_u32 s13, s14, 0
; GFX9-NEXT: s_mul_i32 s12, s4, s12
; GFX9-NEXT: s_add_u32 s5, s5, s12
; GFX9-NEXT: s_addc_u32 s12, 0, s13
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: s_addc_u32 s4, s4, s12
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
; GFX9-NEXT: s_mul_i32 s5, s0, s4
; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12
; GFX9-NEXT: s_add_i32 s5, s13, s5
; GFX9-NEXT: s_mul_i32 s1, s1, s12
; GFX9-NEXT: s_add_i32 s5, s5, s1
; GFX9-NEXT: s_mul_i32 s0, s0, s12
; GFX9-NEXT: s_mul_hi_u32 s13, s4, s0
; GFX9-NEXT: s_mul_i32 s14, s4, s0
; GFX9-NEXT: s_mul_i32 s16, s12, s5
; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0
; GFX9-NEXT: s_mul_hi_u32 s15, s12, s5
; GFX9-NEXT: s_add_u32 s0, s0, s16
; GFX9-NEXT: s_addc_u32 s12, 0, s15
; GFX9-NEXT: s_add_u32 s0, s0, s14
; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5
; GFX9-NEXT: s_addc_u32 s0, s12, s13
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_mul_i32 s5, s4, s5
; GFX9-NEXT: s_add_u32 s0, s0, s5
; GFX9-NEXT: s_addc_u32 s1, 0, s1
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: s_addc_u32 s12, s4, s1
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s0, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s1, s11, s4
; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s13, v1
; GFX9-NEXT: s_mul_i32 s1, s10, s12
; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13
; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12
; GFX9-NEXT: s_add_u32 s1, s14, s1
; GFX9-NEXT: s_addc_u32 s0, 0, s0
; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13
; GFX9-NEXT: s_mul_i32 s13, s11, s13
; GFX9-NEXT: s_add_u32 s1, s1, s13
; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12
; GFX9-NEXT: s_addc_u32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s1, s14, 0
; GFX9-NEXT: s_mul_i32 s12, s11, s12
; GFX9-NEXT: s_add_u32 s12, s0, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s1
; GFX9-NEXT: s_mul_i32 s0, s6, s13
; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12
; GFX9-NEXT: s_add_i32 s0, s1, s0
; GFX9-NEXT: s_mul_i32 s1, s7, s12
; GFX9-NEXT: s_add_i32 s14, s0, s1
; GFX9-NEXT: s_mul_i32 s1, s6, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_sub_i32 s0, s11, s14
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: s_subb_u32 s10, s0, s7
; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_ge_u32 s10, s7
; GFX9-NEXT: s_cselect_b32 s15, -1, 0
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2
; GFX9-NEXT: s_cmp_eq_u32 s10, s7
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s15
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s12, 1
; GFX9-NEXT: s_addc_u32 s10, s13, 0
; GFX9-NEXT: s_add_u32 s1, s12, 2
; GFX9-NEXT: s_addc_u32 s15, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: v_mov_b32_e32 v4, s15
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX9-NEXT: s_subb_u32 s0, s11, s14
; GFX9-NEXT: s_cmp_ge_u32 s0, s7
; GFX9-NEXT: s_cselect_b32 s1, -1, 0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1
; GFX9-NEXT: s_cmp_eq_u32 s0, s7
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, s13
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s12
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v4, vcc
; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9]
; GFX9-NEXT: s_endpgm
  %shl.y = shl i64 4096, %y
  %r = sdiv i64 %x, %shl.y
  store i64 %r, ptr addrspace(1) %out
  ret void
}

; <2 x i64> signed division where both elements divide by the constant 4096.
; Expected results: the opt run splits the vector sdiv into one scalar sdiv per
; element (extractelement / sdiv / insertelement); both llc runs then use an
; add-then-arithmetic-shift-right-by-12 sequence for each element.
; The assertions are autogenerated (see the note at the top of the file), so
; regenerate them with the update script instead of editing them by hand.
define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_ashr_i32 s8, s1, 31
; GFX6-NEXT: s_lshr_b32 s8, s8, 20
; GFX6-NEXT: s_add_u32 s0, s0, s8
; GFX6-NEXT: s_addc_u32 s1, s1, 0
; GFX6-NEXT: s_ashr_i32 s8, s3, 31
; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
; GFX6-NEXT: s_lshr_b32 s8, s8, 20
; GFX6-NEXT: s_add_u32 s2, s2, s8
; GFX6-NEXT: s_addc_u32 s3, s3, 0
; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v2, s2
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
; GFX9-NEXT: s_lshr_b32 s4, s4, 20
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
; GFX9-NEXT: s_lshr_b32 s4, s4, 20
; GFX9-NEXT: s_add_u32 s2, s2, s4
; GFX9-NEXT: s_addc_u32 s3, s3, 0
; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_endpgm
  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
  store <2 x i64> %r, ptr addrspace(1) %out
  ret void
}

; <2 x i64> signed division with mixed constants: element 0 divides by 4096 and
; element 1 divides by 4095.
; Expected results: the opt run emits one scalar sdiv per element. In the llc
; expectations the 4095 element is computed with multiplies by the constants
; 0x8008009 and 0x80080080; no float reciprocal appears in the expected code.
; NOTE(review): the doubled "s" in the function name looks like a typo; it is
; left alone because the label lines below must match the symbol name.
define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) {
; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16
; CHECK-NEXT: ret void
;
; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: v_mov_b32_e32 v2, 0x8008009
; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080080
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: v_mul_hi_u32 v4, s11, v2
; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
; GFX6-NEXT: s_mul_i32 s7, s11, 0x8008009
; GFX6-NEXT: v_add_i32_e32 v3, vcc, s7, v3
; GFX6-NEXT: s_mul_i32 s6, s10, 0x80080080
; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: s_lshr_b32 s4, s4, 20
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GFX6-NEXT: s_add_u32 s4, s8, s4
; GFX6-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
; GFX6-NEXT: s_addc_u32 s5, s9, 0
; GFX6-NEXT: s_ashr_i32 s7, s11, 31
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2
;
GFX6-NEXT: s_mul_i32 s6, s11, 0x80080080
; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1
; GFX6-NEXT: s_mul_i32 s6, s7, 0x80080080
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; GFX6-NEXT: s_mul_i32 s6, s7, 0x8008009
; GFX6-NEXT: v_add_i32_e32 v2, vcc, s6, v2
; GFX6-NEXT: v_mov_b32_e32 v3, s6
; GFX6-NEXT: v_mov_b32_e32 v4, s11
; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v3
; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc
; GFX6-NEXT: v_mov_b32_e32 v3, s11
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s10, v1
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc
; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 11
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1
; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; The expectations below are for the second llc run in the file header
; (-mcpu=gfx900) of the same kernel.
; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
; GFX9-NEXT: s_lshr_b32 s4, s4, 20
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
; GFX9-NEXT: s_mul_i32 s9, s3, 0x8008009
; GFX9-NEXT: s_mul_hi_u32 s10, s2, 0x8008009
; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x8008009
; GFX9-NEXT: s_add_u32 s9, s9, s10
; GFX9-NEXT: s_mul_i32 s8, s2, 0x80080080
; GFX9-NEXT: s_addc_u32 s4, s4, 0
; GFX9-NEXT: s_mul_hi_u32 s5, s2, 0x80080080
; GFX9-NEXT: s_add_u32 s8, s8, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
; GFX9-NEXT: s_add_u32 s4, s4, s5
; GFX9-NEXT: s_addc_u32 s5, 0, 0
; GFX9-NEXT: s_mul_i32 s9, s3, 0x80080080
; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x80080080
; GFX9-NEXT: s_add_u32 s4, s9, s4
; GFX9-NEXT: s_addc_u32 s5, s8, s5
; GFX9-NEXT: s_ashr_i32 s8, s3, 31
; GFX9-NEXT: s_mul_i32 s9, s8, 0x80080080
; GFX9-NEXT: s_mul_hi_u32 s10, s8, 0x8008009
; GFX9-NEXT: s_add_i32 s9, s10, s9
; GFX9-NEXT: s_mul_i32 s8, s8, 0x8008009
; GFX9-NEXT: s_add_i32 s9, s9, s8
; GFX9-NEXT: s_sub_u32 s8, s8, s2
; GFX9-NEXT: s_subb_u32 s9, s9, s3
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, s9
; GFX9-NEXT: s_add_u32 s2, s4, s2
; GFX9-NEXT: s_addc_u32 s3, s5, s3
; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 11
; GFX9-NEXT: s_lshr_b32 s2, s3, 31
; GFX9-NEXT: s_add_u32 s2, s4, s2
; GFX9-NEXT: s_addc_u32 s3, s5, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX9-NEXT: s_endpgm
  ; Input IR under test: element 0 is divided by 4096 (a power of two) and
  ; element 1 by 4095, so one vector sdiv mixes both constant kinds.
  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
  store <2 x i64> %r, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
; CHECK-NEXT: [[TMP7:%.*]] = sdiv
i64 [[TMP5]], [[TMP6]] 8272; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 8273; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16 8274; CHECK-NEXT: ret void 8275; 8276; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 8277; GFX6: ; %bb.0: 8278; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 8279; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 8280; GFX6-NEXT: s_mov_b32 s7, 0xf000 8281; GFX6-NEXT: s_mov_b32 s6, -1 8282; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8283; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 8284; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s14 8285; GFX6-NEXT: s_ashr_i32 s12, s1, 31 8286; GFX6-NEXT: s_add_u32 s0, s0, s12 8287; GFX6-NEXT: s_mov_b32 s13, s12 8288; GFX6-NEXT: s_addc_u32 s1, s1, s12 8289; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[12:13] 8290; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 8291; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 8292; GFX6-NEXT: s_sub_u32 s0, 0, s2 8293; GFX6-NEXT: s_subb_u32 s1, 0, s3 8294; GFX6-NEXT: s_ashr_i32 s16, s9, 31 8295; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8296; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8297; GFX6-NEXT: s_mov_b32 s17, s16 8298; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8299; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8300; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8301; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8302; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8303; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8304; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8305; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8306; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 8307; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 8308; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8309; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8310; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8311; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8312; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 8313; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 8314; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 8315; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 8316; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8317; GFX6-NEXT: v_addc_u32_e32 v5, 
vcc, 0, v7, vcc 8318; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8319; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 8320; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc 8321; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 8322; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8323; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8324; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8325; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8326; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8327; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8328; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 8329; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8330; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 8331; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8332; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8333; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8334; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8335; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8336; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8337; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8338; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8339; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8340; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8341; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8342; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8343; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8344; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8345; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8346; GFX6-NEXT: s_add_u32 s0, s8, s16 8347; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8348; GFX6-NEXT: s_addc_u32 s1, s9, s16 8349; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8350; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[16:17] 8351; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 8352; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 8353; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1 8354; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 8355; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 8356; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8357; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8358; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 8359; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 8360; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8361; GFX6-NEXT: v_addc_u32_e32 
v0, vcc, v3, v0, vcc 8362; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8363; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8364; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8365; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8366; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8367; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8368; GFX6-NEXT: v_mov_b32_e32 v5, s3 8369; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8370; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 8371; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 8372; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v2 8373; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 8374; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 8375; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 8376; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 8377; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 8378; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8379; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 8380; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8381; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 8382; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 8383; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 8384; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 8385; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 8386; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 8387; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8388; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] 8389; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] 8390; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[12:13] 8391; GFX6-NEXT: s_ashr_i32 s8, s15, 31 8392; GFX6-NEXT: s_add_u32 s12, s14, s8 8393; GFX6-NEXT: v_mov_b32_e32 v6, s9 8394; GFX6-NEXT: s_mov_b32 s9, s8 8395; GFX6-NEXT: s_addc_u32 s13, s15, s8 8396; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] 8397; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 8398; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 8399; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 8400; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 8401; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 8402; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 8403; 
GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 8404; GFX6-NEXT: v_rcp_f32_e32 v6, v6 8405; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8406; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 8407; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 8408; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8409; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 8410; GFX6-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 8411; GFX6-NEXT: v_trunc_f32_e32 v3, v3 8412; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 8413; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 8414; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 8415; GFX6-NEXT: s_sub_u32 s2, 0, s12 8416; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 8417; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8418; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 8419; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 8420; GFX6-NEXT: s_subb_u32 s3, 0, s13 8421; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 8422; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 8423; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8424; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 8425; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 8426; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 8427; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 8428; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 8429; GFX6-NEXT: v_mul_hi_u32 v9, v3, v4 8430; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 8431; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8432; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8433; GFX6-NEXT: v_mul_lo_u32 v8, v3, v5 8434; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 8435; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 8436; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 8437; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 8438; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc 8439; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8440; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8441; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8442; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 8443; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 8444; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 8445; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 8446; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 8447; GFX6-NEXT: 
v_mul_lo_u32 v5, s2, v2 8448; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 8449; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 8450; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 8451; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 8452; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 8453; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 8454; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 8455; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 8456; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 8457; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 8458; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 8459; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 8460; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 8461; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8462; GFX6-NEXT: s_ashr_i32 s2, s11, 31 8463; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8464; GFX6-NEXT: s_add_u32 s10, s10, s2 8465; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8466; GFX6-NEXT: s_mov_b32 s3, s2 8467; GFX6-NEXT: s_addc_u32 s11, s11, s2 8468; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 8469; GFX6-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] 8470; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 8471; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 8472; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 8473; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 8474; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 8475; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8476; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 8477; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 8478; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 8479; GFX6-NEXT: v_mov_b32_e32 v6, s1 8480; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 8481; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc 8482; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 8483; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8484; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8485; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 8486; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 8487; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 8488; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc 8489; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 8490; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 8491; GFX6-NEXT: 
v_mul_lo_u32 v5, s12, v2 8492; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 8493; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 8494; GFX6-NEXT: v_mov_b32_e32 v7, s13 8495; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 8496; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 8497; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 8498; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 8499; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 8500; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 8501; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 8502; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 8503; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 8504; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 8505; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 8506; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 8507; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 2, v2 8508; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 8509; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 8510; GFX6-NEXT: v_cndmask_b32_e64 v6, v7, v9, s[0:1] 8511; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] 8512; GFX6-NEXT: v_mov_b32_e32 v8, s11 8513; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 8514; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 8515; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 8516; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 8517; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 8518; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 8519; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 8520; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 8521; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 8522; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] 8523; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 8524; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 8525; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 8526; GFX6-NEXT: v_mov_b32_e32 v4, s1 8527; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 8528; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 8529; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8530; GFX6-NEXT: s_endpgm 8531; 8532; GFX9-LABEL: 
sdiv_v2i64_pow2_shl_denom: 8533; GFX9: ; %bb.0: 8534; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 8535; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 8536; GFX9-NEXT: v_mov_b32_e32 v4, 0 8537; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8538; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 8539; GFX9-NEXT: s_lshl_b64 s[6:7], 0x1000, s14 8540; GFX9-NEXT: s_ashr_i32 s12, s1, 31 8541; GFX9-NEXT: s_add_u32 s0, s0, s12 8542; GFX9-NEXT: s_mov_b32 s13, s12 8543; GFX9-NEXT: s_addc_u32 s1, s1, s12 8544; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] 8545; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 8546; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 8547; GFX9-NEXT: s_sub_u32 s0, 0, s14 8548; GFX9-NEXT: s_subb_u32 s1, 0, s15 8549; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8550; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8551; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8552; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8553; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8554; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8555; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8556; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8557; GFX9-NEXT: v_readfirstlane_b32 s4, v1 8558; GFX9-NEXT: v_readfirstlane_b32 s5, v0 8559; GFX9-NEXT: s_mul_i32 s16, s0, s4 8560; GFX9-NEXT: s_mul_hi_u32 s18, s0, s5 8561; GFX9-NEXT: s_mul_i32 s17, s1, s5 8562; GFX9-NEXT: s_add_i32 s16, s18, s16 8563; GFX9-NEXT: s_mul_i32 s19, s0, s5 8564; GFX9-NEXT: s_add_i32 s16, s16, s17 8565; GFX9-NEXT: s_mul_hi_u32 s17, s5, s16 8566; GFX9-NEXT: s_mul_i32 s18, s5, s16 8567; GFX9-NEXT: s_mul_hi_u32 s5, s5, s19 8568; GFX9-NEXT: s_add_u32 s5, s5, s18 8569; GFX9-NEXT: s_addc_u32 s17, 0, s17 8570; GFX9-NEXT: s_mul_hi_u32 s20, s4, s19 8571; GFX9-NEXT: s_mul_i32 s19, s4, s19 8572; GFX9-NEXT: s_add_u32 s5, s5, s19 8573; GFX9-NEXT: s_mul_hi_u32 s18, s4, s16 8574; GFX9-NEXT: s_addc_u32 s5, s17, s20 8575; GFX9-NEXT: s_addc_u32 s17, s18, 0 8576; GFX9-NEXT: s_mul_i32 s16, s4, s16 8577; GFX9-NEXT: s_add_u32 s5, s5, s16 8578; GFX9-NEXT: s_addc_u32 s16, 0, s17 8579; GFX9-NEXT: v_add_co_u32_e32 v0, 
vcc, s5, v0 8580; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8581; GFX9-NEXT: s_addc_u32 s4, s4, s16 8582; GFX9-NEXT: v_readfirstlane_b32 s16, v0 8583; GFX9-NEXT: s_mul_i32 s5, s0, s4 8584; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16 8585; GFX9-NEXT: s_add_i32 s5, s17, s5 8586; GFX9-NEXT: s_mul_i32 s1, s1, s16 8587; GFX9-NEXT: s_add_i32 s5, s5, s1 8588; GFX9-NEXT: s_mul_i32 s0, s0, s16 8589; GFX9-NEXT: s_mul_hi_u32 s17, s4, s0 8590; GFX9-NEXT: s_mul_i32 s18, s4, s0 8591; GFX9-NEXT: s_mul_i32 s20, s16, s5 8592; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0 8593; GFX9-NEXT: s_mul_hi_u32 s19, s16, s5 8594; GFX9-NEXT: s_add_u32 s0, s0, s20 8595; GFX9-NEXT: s_addc_u32 s16, 0, s19 8596; GFX9-NEXT: s_add_u32 s0, s0, s18 8597; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5 8598; GFX9-NEXT: s_addc_u32 s0, s16, s17 8599; GFX9-NEXT: s_addc_u32 s1, s1, 0 8600; GFX9-NEXT: s_mul_i32 s5, s4, s5 8601; GFX9-NEXT: s_add_u32 s0, s0, s5 8602; GFX9-NEXT: s_addc_u32 s1, 0, s1 8603; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 8604; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8605; GFX9-NEXT: s_addc_u32 s16, s4, s1 8606; GFX9-NEXT: s_ashr_i32 s4, s9, 31 8607; GFX9-NEXT: s_add_u32 s0, s8, s4 8608; GFX9-NEXT: s_mov_b32 s5, s4 8609; GFX9-NEXT: s_addc_u32 s1, s9, s4 8610; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] 8611; GFX9-NEXT: v_readfirstlane_b32 s17, v0 8612; GFX9-NEXT: s_mul_i32 s1, s8, s16 8613; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 8614; GFX9-NEXT: s_mul_hi_u32 s0, s8, s16 8615; GFX9-NEXT: s_add_u32 s1, s18, s1 8616; GFX9-NEXT: s_addc_u32 s0, 0, s0 8617; GFX9-NEXT: s_mul_hi_u32 s19, s9, s17 8618; GFX9-NEXT: s_mul_i32 s17, s9, s17 8619; GFX9-NEXT: s_add_u32 s1, s1, s17 8620; GFX9-NEXT: s_mul_hi_u32 s18, s9, s16 8621; GFX9-NEXT: s_addc_u32 s0, s0, s19 8622; GFX9-NEXT: s_addc_u32 s1, s18, 0 8623; GFX9-NEXT: s_mul_i32 s16, s9, s16 8624; GFX9-NEXT: s_add_u32 s16, s0, s16 8625; GFX9-NEXT: s_addc_u32 s17, 0, s1 8626; GFX9-NEXT: s_mul_i32 s0, s14, s17 8627; GFX9-NEXT: s_mul_hi_u32 s1, s14, s16 8628; GFX9-NEXT: s_add_i32 s0, s1, s0 8629; 
GFX9-NEXT: s_mul_i32 s1, s15, s16 8630; GFX9-NEXT: s_add_i32 s18, s0, s1 8631; GFX9-NEXT: s_mul_i32 s1, s14, s16 8632; GFX9-NEXT: v_mov_b32_e32 v0, s1 8633; GFX9-NEXT: s_sub_i32 s0, s9, s18 8634; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 8635; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8636; GFX9-NEXT: s_subb_u32 s8, s0, s15 8637; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s14, v0 8638; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 8639; GFX9-NEXT: s_subb_u32 s8, s8, 0 8640; GFX9-NEXT: s_cmp_ge_u32 s8, s15 8641; GFX9-NEXT: s_cselect_b32 s19, -1, 0 8642; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v1 8643; GFX9-NEXT: s_cmp_eq_u32 s8, s15 8644; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] 8645; GFX9-NEXT: v_mov_b32_e32 v2, s19 8646; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 8647; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] 8648; GFX9-NEXT: s_add_u32 s0, s16, 1 8649; GFX9-NEXT: s_addc_u32 s8, s17, 0 8650; GFX9-NEXT: s_add_u32 s1, s16, 2 8651; GFX9-NEXT: s_addc_u32 s19, s17, 0 8652; GFX9-NEXT: v_mov_b32_e32 v2, s0 8653; GFX9-NEXT: v_mov_b32_e32 v3, s1 8654; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 8655; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] 8656; GFX9-NEXT: v_mov_b32_e32 v2, s8 8657; GFX9-NEXT: v_mov_b32_e32 v3, s19 8658; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8659; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 8660; GFX9-NEXT: s_subb_u32 s0, s9, s18 8661; GFX9-NEXT: s_cmp_ge_u32 s0, s15 8662; GFX9-NEXT: s_cselect_b32 s1, -1, 0 8663; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 8664; GFX9-NEXT: s_cmp_eq_u32 s0, s15 8665; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 8666; GFX9-NEXT: v_mov_b32_e32 v3, s1 8667; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 8668; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[12:13] 8669; GFX9-NEXT: s_ashr_i32 s4, s7, 31 8670; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 8671; GFX9-NEXT: s_add_u32 s6, s6, s4 8672; GFX9-NEXT: v_mov_b32_e32 v3, s17 8673; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 8674; GFX9-NEXT: s_mov_b32 s5, s4 8675; GFX9-NEXT: s_addc_u32 s7, s7, s4 8676; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v3, v2, vcc 8677; GFX9-NEXT: v_mov_b32_e32 v2, s16 8678; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] 8679; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 8680; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 8681; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 8682; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 8683; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0 8684; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1 8685; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 8686; GFX9-NEXT: v_rcp_f32_e32 v2, v2 8687; GFX9-NEXT: s_sub_u32 s0, 0, s6 8688; GFX9-NEXT: v_mov_b32_e32 v6, s1 8689; GFX9-NEXT: s_subb_u32 s1, 0, s7 8690; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 8691; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 8692; GFX9-NEXT: v_trunc_f32_e32 v3, v3 8693; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 8694; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 8695; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 8696; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v6, vcc 8697; GFX9-NEXT: v_readfirstlane_b32 s8, v2 8698; GFX9-NEXT: v_readfirstlane_b32 s13, v3 8699; GFX9-NEXT: s_mul_hi_u32 s12, s0, s8 8700; GFX9-NEXT: s_mul_i32 s14, s0, s13 8701; GFX9-NEXT: s_mul_i32 s9, s1, s8 8702; GFX9-NEXT: s_add_i32 s12, s12, s14 8703; GFX9-NEXT: s_add_i32 s12, s12, s9 8704; GFX9-NEXT: s_mul_i32 s15, s0, s8 8705; GFX9-NEXT: s_mul_hi_u32 s9, s8, s12 8706; GFX9-NEXT: s_mul_i32 s14, s8, s12 8707; GFX9-NEXT: s_mul_hi_u32 s8, s8, s15 8708; GFX9-NEXT: s_add_u32 s8, s8, s14 8709; GFX9-NEXT: s_addc_u32 s9, 0, s9 8710; GFX9-NEXT: s_mul_hi_u32 s16, s13, s15 8711; GFX9-NEXT: s_mul_i32 s15, s13, s15 8712; GFX9-NEXT: s_add_u32 s8, s8, s15 8713; GFX9-NEXT: s_mul_hi_u32 s14, s13, s12 8714; GFX9-NEXT: s_addc_u32 s8, s9, s16 8715; GFX9-NEXT: s_addc_u32 s9, s14, 0 8716; GFX9-NEXT: s_mul_i32 s12, s13, s12 8717; GFX9-NEXT: s_add_u32 s8, s8, s12 8718; GFX9-NEXT: s_addc_u32 s9, 0, s9 8719; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 8720; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8721; GFX9-NEXT: s_addc_u32 s8, s13, s9 8722; GFX9-NEXT: v_readfirstlane_b32 s12, v2 8723; 
GFX9-NEXT: s_mul_i32 s9, s0, s8 8724; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 8725; GFX9-NEXT: s_add_i32 s9, s13, s9 8726; GFX9-NEXT: s_mul_i32 s1, s1, s12 8727; GFX9-NEXT: s_add_i32 s9, s9, s1 8728; GFX9-NEXT: s_mul_i32 s0, s0, s12 8729; GFX9-NEXT: s_mul_hi_u32 s13, s8, s0 8730; GFX9-NEXT: s_mul_i32 s14, s8, s0 8731; GFX9-NEXT: s_mul_i32 s16, s12, s9 8732; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 8733; GFX9-NEXT: s_mul_hi_u32 s15, s12, s9 8734; GFX9-NEXT: s_add_u32 s0, s0, s16 8735; GFX9-NEXT: s_addc_u32 s12, 0, s15 8736; GFX9-NEXT: s_add_u32 s0, s0, s14 8737; GFX9-NEXT: s_mul_hi_u32 s1, s8, s9 8738; GFX9-NEXT: s_addc_u32 s0, s12, s13 8739; GFX9-NEXT: s_addc_u32 s1, s1, 0 8740; GFX9-NEXT: s_mul_i32 s9, s8, s9 8741; GFX9-NEXT: s_add_u32 s0, s0, s9 8742; GFX9-NEXT: s_addc_u32 s1, 0, s1 8743; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 8744; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8745; GFX9-NEXT: s_addc_u32 s12, s8, s1 8746; GFX9-NEXT: s_ashr_i32 s8, s11, 31 8747; GFX9-NEXT: s_add_u32 s0, s10, s8 8748; GFX9-NEXT: s_mov_b32 s9, s8 8749; GFX9-NEXT: s_addc_u32 s1, s11, s8 8750; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] 8751; GFX9-NEXT: v_readfirstlane_b32 s13, v2 8752; GFX9-NEXT: s_mul_i32 s1, s10, s12 8753; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13 8754; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12 8755; GFX9-NEXT: s_add_u32 s1, s14, s1 8756; GFX9-NEXT: s_addc_u32 s0, 0, s0 8757; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13 8758; GFX9-NEXT: s_mul_i32 s13, s11, s13 8759; GFX9-NEXT: s_add_u32 s1, s1, s13 8760; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12 8761; GFX9-NEXT: s_addc_u32 s0, s0, s15 8762; GFX9-NEXT: s_addc_u32 s1, s14, 0 8763; GFX9-NEXT: s_mul_i32 s12, s11, s12 8764; GFX9-NEXT: s_add_u32 s12, s0, s12 8765; GFX9-NEXT: s_addc_u32 s13, 0, s1 8766; GFX9-NEXT: s_mul_i32 s0, s6, s13 8767; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12 8768; GFX9-NEXT: s_add_i32 s0, s1, s0 8769; GFX9-NEXT: s_mul_i32 s1, s7, s12 8770; GFX9-NEXT: s_add_i32 s14, s0, s1 8771; GFX9-NEXT: s_mul_i32 s1, s6, s12 8772; GFX9-NEXT: 
v_mov_b32_e32 v2, s1 8773; GFX9-NEXT: s_sub_i32 s0, s11, s14 8774; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 8775; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8776; GFX9-NEXT: s_subb_u32 s10, s0, s7 8777; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s6, v2 8778; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 8779; GFX9-NEXT: s_subb_u32 s10, s10, 0 8780; GFX9-NEXT: s_cmp_ge_u32 s10, s7 8781; GFX9-NEXT: s_cselect_b32 s15, -1, 0 8782; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3 8783; GFX9-NEXT: s_cmp_eq_u32 s10, s7 8784; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] 8785; GFX9-NEXT: v_mov_b32_e32 v5, s15 8786; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 8787; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] 8788; GFX9-NEXT: s_add_u32 s0, s12, 1 8789; GFX9-NEXT: s_addc_u32 s10, s13, 0 8790; GFX9-NEXT: s_add_u32 s1, s12, 2 8791; GFX9-NEXT: s_addc_u32 s15, s13, 0 8792; GFX9-NEXT: v_mov_b32_e32 v5, s0 8793; GFX9-NEXT: v_mov_b32_e32 v6, s1 8794; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 8795; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] 8796; GFX9-NEXT: v_mov_b32_e32 v5, s10 8797; GFX9-NEXT: v_mov_b32_e32 v6, s15 8798; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8799; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] 8800; GFX9-NEXT: s_subb_u32 s0, s11, s14 8801; GFX9-NEXT: s_cmp_ge_u32 s0, s7 8802; GFX9-NEXT: s_cselect_b32 s1, -1, 0 8803; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 8804; GFX9-NEXT: s_cmp_eq_u32 s0, s7 8805; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc 8806; GFX9-NEXT: v_mov_b32_e32 v6, s1 8807; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 8808; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 8809; GFX9-NEXT: v_mov_b32_e32 v6, s13 8810; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8811; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc 8812; GFX9-NEXT: v_mov_b32_e32 v5, s12 8813; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 8814; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[4:5] 8815; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 8816; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2 8817; GFX9-NEXT: v_mov_b32_e32 v6, s1 8818; GFX9-NEXT: 
v_subrev_co_u32_e32 v2, vcc, s0, v3 8819; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc 8820; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8821; GFX9-NEXT: s_endpgm 8822 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 8823 %r = sdiv <2 x i64> %x, %shl.y 8824 store <2 x i64> %r, ptr addrspace(1) %out 8825 ret void 8826} 8827 8828define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) { 8829; CHECK-LABEL: @srem_i64_oddk_denom( 8830; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 8831; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 8832; CHECK-NEXT: ret void 8833; 8834; GFX6-LABEL: srem_i64_oddk_denom: 8835; GFX6: ; %bb.0: 8836; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 8837; GFX6-NEXT: v_mov_b32_e32 v2, 0xfd81e19 8838; GFX6-NEXT: v_mov_b32_e32 v0, 0x6ca94220 8839; GFX6-NEXT: s_mov_b32 s3, 0xf000 8840; GFX6-NEXT: s_mov_b32 s2, -1 8841; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8842; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 8843; GFX6-NEXT: v_mul_hi_u32 v4, s7, v2 8844; GFX6-NEXT: s_mov_b32 s0, s4 8845; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 8846; GFX6-NEXT: s_mul_i32 s4, s7, 0xfd81e19 8847; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 8848; GFX6-NEXT: s_mul_i32 s1, s6, 0x6ca94220 8849; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8850; GFX6-NEXT: s_ashr_i32 s4, s7, 31 8851; GFX6-NEXT: v_add_i32_e32 v3, vcc, s1, v3 8852; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 8853; GFX6-NEXT: v_mul_hi_u32 v2, s4, v2 8854; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8855; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 8856; GFX6-NEXT: s_mul_i32 s1, s7, 0x6ca94220 8857; GFX6-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, vcc 8858; GFX6-NEXT: v_add_i32_e32 v1, vcc, s1, v1 8859; GFX6-NEXT: s_mul_i32 s1, s4, 0x6ca94220 8860; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc 8861; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v2 8862; GFX6-NEXT: s_mul_i32 s4, s4, 0xfd81e19 8863; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v0 8864; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1 8865; 
GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc 8866; GFX6-NEXT: v_ashr_i64 v[2:3], v[0:1], 19 8867; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 8868; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 8869; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 8870; GFX6-NEXT: s_mov_b32 s4, 0x12d8fb 8871; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 8872; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4 8873; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 8874; GFX6-NEXT: s_mov_b32 s1, s5 8875; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 8876; GFX6-NEXT: v_mov_b32_e32 v2, s7 8877; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 8878; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 8879; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8880; GFX6-NEXT: s_endpgm 8881; 8882; GFX9-LABEL: srem_i64_oddk_denom: 8883; GFX9: ; %bb.0: 8884; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8885; GFX9-NEXT: v_mov_b32_e32 v2, 0 8886; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8887; GFX9-NEXT: s_mul_i32 s7, s3, 0xfd81e19 8888; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0xfd81e19 8889; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0xfd81e19 8890; GFX9-NEXT: s_add_u32 s7, s7, s8 8891; GFX9-NEXT: s_mul_i32 s5, s2, 0x6ca94220 8892; GFX9-NEXT: s_addc_u32 s6, s6, 0 8893; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0x6ca94220 8894; GFX9-NEXT: s_add_u32 s5, s5, s7 8895; GFX9-NEXT: s_addc_u32 s4, s4, 0 8896; GFX9-NEXT: s_add_u32 s4, s6, s4 8897; GFX9-NEXT: s_addc_u32 s5, 0, 0 8898; GFX9-NEXT: s_mul_i32 s7, s3, 0x6ca94220 8899; GFX9-NEXT: s_mul_hi_u32 s6, s3, 0x6ca94220 8900; GFX9-NEXT: s_add_u32 s4, s7, s4 8901; GFX9-NEXT: s_addc_u32 s5, s6, s5 8902; GFX9-NEXT: s_ashr_i32 s6, s3, 31 8903; GFX9-NEXT: s_mul_i32 s7, s6, 0x6ca94220 8904; GFX9-NEXT: s_mul_hi_u32 s8, s6, 0xfd81e19 8905; GFX9-NEXT: s_add_i32 s7, s8, s7 8906; GFX9-NEXT: s_mul_i32 s6, s6, 0xfd81e19 8907; GFX9-NEXT: s_add_i32 s7, s7, s6 8908; GFX9-NEXT: s_add_u32 s4, s4, s6 8909; GFX9-NEXT: s_addc_u32 s5, s5, s7 8910; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 19 8911; GFX9-NEXT: s_lshr_b32 s4, s5, 31 8912; GFX9-NEXT: s_add_u32 s4, s6, s4 8913; 
GFX9-NEXT: s_addc_u32 s5, s7, 0 8914; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb 8915; GFX9-NEXT: s_mul_hi_u32 s6, s4, 0x12d8fb 8916; GFX9-NEXT: s_add_i32 s6, s6, s5 8917; GFX9-NEXT: s_mul_i32 s4, s4, 0x12d8fb 8918; GFX9-NEXT: s_sub_u32 s2, s2, s4 8919; GFX9-NEXT: s_subb_u32 s3, s3, s6 8920; GFX9-NEXT: v_mov_b32_e32 v0, s2 8921; GFX9-NEXT: v_mov_b32_e32 v1, s3 8922; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 8923; GFX9-NEXT: s_endpgm 8924 %r = srem i64 %x, 1235195 8925 store i64 %r, ptr addrspace(1) %out 8926 ret void 8927} 8928 8929define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) { 8930; CHECK-LABEL: @srem_i64_pow2k_denom( 8931; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 8932; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 8933; CHECK-NEXT: ret void 8934; 8935; GFX6-LABEL: srem_i64_pow2k_denom: 8936; GFX6: ; %bb.0: 8937; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 8938; GFX6-NEXT: s_mov_b32 s7, 0xf000 8939; GFX6-NEXT: s_mov_b32 s6, -1 8940; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8941; GFX6-NEXT: s_mov_b32 s4, s0 8942; GFX6-NEXT: s_ashr_i32 s0, s3, 31 8943; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8944; GFX6-NEXT: s_add_u32 s0, s2, s0 8945; GFX6-NEXT: s_mov_b32 s5, s1 8946; GFX6-NEXT: s_addc_u32 s1, s3, 0 8947; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 8948; GFX6-NEXT: s_sub_u32 s0, s2, s0 8949; GFX6-NEXT: s_subb_u32 s1, s3, s1 8950; GFX6-NEXT: v_mov_b32_e32 v0, s0 8951; GFX6-NEXT: v_mov_b32_e32 v1, s1 8952; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8953; GFX6-NEXT: s_endpgm 8954; 8955; GFX9-LABEL: srem_i64_pow2k_denom: 8956; GFX9: ; %bb.0: 8957; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 8958; GFX9-NEXT: v_mov_b32_e32 v2, 0 8959; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8960; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8961; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8962; GFX9-NEXT: s_add_u32 s4, s2, s4 8963; GFX9-NEXT: s_addc_u32 s5, s3, 0 8964; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 8965; GFX9-NEXT: s_sub_u32 s2, s2, s4 8966; 
GFX9-NEXT: s_subb_u32 s3, s3, s5 8967; GFX9-NEXT: v_mov_b32_e32 v0, s2 8968; GFX9-NEXT: v_mov_b32_e32 v1, s3 8969; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 8970; GFX9-NEXT: s_endpgm 8971 %r = srem i64 %x, 4096 8972 store i64 %r, ptr addrspace(1) %out 8973 ret void 8974} 8975 8976define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x, i64 %y) { 8977; CHECK-LABEL: @srem_i64_pow2_shl_denom( 8978; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8979; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 8980; CHECK-NEXT: store i64 [[R]], ptr addrspace(1) [[OUT:%.*]], align 8 8981; CHECK-NEXT: ret void 8982; 8983; GFX6-LABEL: srem_i64_pow2_shl_denom: 8984; GFX6: ; %bb.0: 8985; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd 8986; GFX6-NEXT: s_mov_b32 s7, 0xf000 8987; GFX6-NEXT: s_mov_b32 s6, -1 8988; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8989; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 8990; GFX6-NEXT: s_ashr_i32 s2, s1, 31 8991; GFX6-NEXT: s_add_u32 s0, s0, s2 8992; GFX6-NEXT: s_mov_b32 s3, s2 8993; GFX6-NEXT: s_addc_u32 s1, s1, s2 8994; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 8995; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 8996; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 8997; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 8998; GFX6-NEXT: s_sub_u32 s4, 0, s8 8999; GFX6-NEXT: s_subb_u32 s5, 0, s9 9000; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 9001; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9002; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9003; GFX6-NEXT: s_ashr_i32 s10, s3, 31 9004; GFX6-NEXT: s_add_u32 s2, s2, s10 9005; GFX6-NEXT: s_mov_b32 s11, s10 9006; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9007; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9008; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9009; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 9010; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9011; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9012; GFX6-NEXT: s_addc_u32 s3, s3, s10 9013; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 9014; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9015; GFX6-NEXT: 
v_mul_hi_u32 v3, s4, v0 9016; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 9017; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 9018; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9019; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9020; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9021; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9022; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 9023; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9024; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9025; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 9026; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9027; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9028; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9029; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9030; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9031; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 9032; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9033; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9034; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9035; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9036; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9037; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 9038; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 9039; GFX6-NEXT: s_mov_b32 s5, s1 9040; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9041; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 9042; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9043; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 9044; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9045; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9046; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9047; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9048; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9049; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9050; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9051; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9052; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9053; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9054; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9055; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9056; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9057; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9058; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9059; GFX6-NEXT: v_mul_lo_u32 v2, 
s12, v1 9060; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 9061; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 9062; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 9063; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 9064; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9065; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9066; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 9067; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 9068; GFX6-NEXT: s_mov_b32 s4, s0 9069; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9070; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9071; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 9072; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9073; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9074; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 9075; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 9076; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 9077; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 9078; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 9079; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 9080; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 9081; GFX6-NEXT: v_mov_b32_e32 v3, s9 9082; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 9083; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 9084; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 9085; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 9086; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 9087; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 9088; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 9089; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 9090; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 9091; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 9092; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 9093; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 9094; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 9095; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9096; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] 9097; GFX6-NEXT: v_mov_b32_e32 v4, s13 9098; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc 9099; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 9100; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9101; GFX6-NEXT: 
v_cmp_le_u32_e32 vcc, s8, v0 9102; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 9103; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9104; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 9105; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 9106; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9107; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9108; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9109; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 9110; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 9111; GFX6-NEXT: v_mov_b32_e32 v2, s10 9112; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 9113; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 9114; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9115; GFX6-NEXT: s_endpgm 9116; 9117; GFX9-LABEL: srem_i64_pow2_shl_denom: 9118; GFX9: ; %bb.0: 9119; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 9120; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 9121; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9122; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 9123; GFX9-NEXT: s_ashr_i32 s2, s1, 31 9124; GFX9-NEXT: s_add_u32 s0, s0, s2 9125; GFX9-NEXT: s_mov_b32 s3, s2 9126; GFX9-NEXT: s_addc_u32 s1, s1, s2 9127; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] 9128; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 9129; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 9130; GFX9-NEXT: s_sub_u32 s0, 0, s6 9131; GFX9-NEXT: s_subb_u32 s1, 0, s7 9132; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 9133; GFX9-NEXT: v_rcp_f32_e32 v1, v0 9134; GFX9-NEXT: v_mov_b32_e32 v0, 0 9135; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 9136; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 9137; GFX9-NEXT: v_trunc_f32_e32 v2, v2 9138; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 9139; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 9140; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9141; GFX9-NEXT: v_readfirstlane_b32 s2, v2 9142; GFX9-NEXT: v_readfirstlane_b32 s3, v1 9143; GFX9-NEXT: s_mul_i32 s4, s0, s2 9144; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3 9145; GFX9-NEXT: s_mul_i32 s5, s1, s3 9146; GFX9-NEXT: s_add_i32 s4, s12, s4 9147; GFX9-NEXT: s_mul_i32 s13, s0, s3 
9148; GFX9-NEXT: s_add_i32 s4, s4, s5 9149; GFX9-NEXT: s_mul_hi_u32 s12, s3, s13 9150; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 9151; GFX9-NEXT: s_mul_i32 s3, s3, s4 9152; GFX9-NEXT: s_add_u32 s3, s12, s3 9153; GFX9-NEXT: s_addc_u32 s5, 0, s5 9154; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13 9155; GFX9-NEXT: s_mul_i32 s13, s2, s13 9156; GFX9-NEXT: s_add_u32 s3, s3, s13 9157; GFX9-NEXT: s_mul_hi_u32 s12, s2, s4 9158; GFX9-NEXT: s_addc_u32 s3, s5, s14 9159; GFX9-NEXT: s_addc_u32 s5, s12, 0 9160; GFX9-NEXT: s_mul_i32 s4, s2, s4 9161; GFX9-NEXT: s_add_u32 s3, s3, s4 9162; GFX9-NEXT: s_addc_u32 s4, 0, s5 9163; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 9164; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9165; GFX9-NEXT: s_addc_u32 s2, s2, s4 9166; GFX9-NEXT: v_readfirstlane_b32 s4, v1 9167; GFX9-NEXT: s_mul_i32 s3, s0, s2 9168; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4 9169; GFX9-NEXT: s_add_i32 s3, s5, s3 9170; GFX9-NEXT: s_mul_i32 s1, s1, s4 9171; GFX9-NEXT: s_add_i32 s3, s3, s1 9172; GFX9-NEXT: s_mul_i32 s0, s0, s4 9173; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0 9174; GFX9-NEXT: s_mul_i32 s12, s2, s0 9175; GFX9-NEXT: s_mul_i32 s14, s4, s3 9176; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0 9177; GFX9-NEXT: s_mul_hi_u32 s13, s4, s3 9178; GFX9-NEXT: s_add_u32 s0, s0, s14 9179; GFX9-NEXT: s_addc_u32 s4, 0, s13 9180; GFX9-NEXT: s_add_u32 s0, s0, s12 9181; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 9182; GFX9-NEXT: s_addc_u32 s0, s4, s5 9183; GFX9-NEXT: s_addc_u32 s1, s1, 0 9184; GFX9-NEXT: s_mul_i32 s3, s2, s3 9185; GFX9-NEXT: s_add_u32 s0, s0, s3 9186; GFX9-NEXT: s_addc_u32 s1, 0, s1 9187; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 9188; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9189; GFX9-NEXT: s_addc_u32 s2, s2, s1 9190; GFX9-NEXT: s_ashr_i32 s4, s11, 31 9191; GFX9-NEXT: s_add_u32 s0, s10, s4 9192; GFX9-NEXT: s_mov_b32 s5, s4 9193; GFX9-NEXT: s_addc_u32 s1, s11, s4 9194; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5] 9195; GFX9-NEXT: v_readfirstlane_b32 s3, v1 9196; GFX9-NEXT: s_mul_i32 s1, s10, s2 9197; GFX9-NEXT: s_mul_hi_u32 s5, s10, 
s3 9198; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2 9199; GFX9-NEXT: s_add_u32 s1, s5, s1 9200; GFX9-NEXT: s_addc_u32 s0, 0, s0 9201; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3 9202; GFX9-NEXT: s_mul_i32 s3, s11, s3 9203; GFX9-NEXT: s_add_u32 s1, s1, s3 9204; GFX9-NEXT: s_mul_hi_u32 s5, s11, s2 9205; GFX9-NEXT: s_addc_u32 s0, s0, s12 9206; GFX9-NEXT: s_addc_u32 s1, s5, 0 9207; GFX9-NEXT: s_mul_i32 s2, s11, s2 9208; GFX9-NEXT: s_add_u32 s0, s0, s2 9209; GFX9-NEXT: s_addc_u32 s1, 0, s1 9210; GFX9-NEXT: s_mul_i32 s1, s6, s1 9211; GFX9-NEXT: s_mul_hi_u32 s2, s6, s0 9212; GFX9-NEXT: s_add_i32 s1, s2, s1 9213; GFX9-NEXT: s_mul_i32 s2, s7, s0 9214; GFX9-NEXT: s_mul_i32 s0, s6, s0 9215; GFX9-NEXT: s_add_i32 s5, s1, s2 9216; GFX9-NEXT: v_mov_b32_e32 v1, s0 9217; GFX9-NEXT: s_sub_i32 s1, s11, s5 9218; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1 9219; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9220; GFX9-NEXT: s_subb_u32 s10, s1, s7 9221; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1 9222; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9223; GFX9-NEXT: s_subb_u32 s12, s10, 0 9224; GFX9-NEXT: s_cmp_ge_u32 s12, s7 9225; GFX9-NEXT: s_cselect_b32 s13, -1, 0 9226; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v2 9227; GFX9-NEXT: s_cmp_eq_u32 s12, s7 9228; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3] 9229; GFX9-NEXT: v_mov_b32_e32 v4, s13 9230; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 9231; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9232; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] 9233; GFX9-NEXT: s_subb_u32 s2, s10, s7 9234; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s6, v2 9235; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9236; GFX9-NEXT: s_subb_u32 s2, s2, 0 9237; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 9238; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 9239; GFX9-NEXT: v_mov_b32_e32 v3, s12 9240; GFX9-NEXT: v_mov_b32_e32 v4, s2 9241; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9242; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 9243; GFX9-NEXT: s_subb_u32 s0, s11, s5 9244; GFX9-NEXT: s_cmp_ge_u32 s0, s7 9245; GFX9-NEXT: s_cselect_b32 
s1, -1, 0 9246; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 9247; GFX9-NEXT: s_cmp_eq_u32 s0, s7 9248; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9249; GFX9-NEXT: v_mov_b32_e32 v5, s1 9250; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 9251; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc 9252; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9253; GFX9-NEXT: v_mov_b32_e32 v5, s0 9254; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9255; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 9256; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 9257; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 9258; GFX9-NEXT: v_mov_b32_e32 v3, s4 9259; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s4, v1 9260; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc 9261; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] 9262; GFX9-NEXT: s_endpgm 9263 %shl.y = shl i64 4096, %y 9264 %r = srem i64 %x, %shl.y 9265 store i64 %r, ptr addrspace(1) %out 9266 ret void 9267} 9268 9269define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i64> %x) { 9270; CHECK-LABEL: @srem_v2i64_pow2k_denom( 9271; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9272; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 9273; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0 9274; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9275; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 9276; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9277; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[OUT:%.*]], align 16 9278; CHECK-NEXT: ret void 9279; 9280; GFX6-LABEL: srem_v2i64_pow2k_denom: 9281; GFX6: ; %bb.0: 9282; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd 9283; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 9284; GFX6-NEXT: s_mov_b32 s7, 0xf000 9285; GFX6-NEXT: s_mov_b32 s6, -1 9286; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9287; GFX6-NEXT: s_ashr_i32 s8, s1, 31 9288; GFX6-NEXT: s_lshr_b32 s8, s8, 20 9289; GFX6-NEXT: s_add_u32 s8, s0, s8 9290; GFX6-NEXT: 
s_addc_u32 s9, s1, 0 9291; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 9292; GFX6-NEXT: s_sub_u32 s0, s0, s8 9293; GFX6-NEXT: s_subb_u32 s1, s1, s9 9294; GFX6-NEXT: s_ashr_i32 s8, s3, 31 9295; GFX6-NEXT: s_lshr_b32 s8, s8, 20 9296; GFX6-NEXT: s_add_u32 s8, s2, s8 9297; GFX6-NEXT: s_addc_u32 s9, s3, 0 9298; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 9299; GFX6-NEXT: s_sub_u32 s2, s2, s8 9300; GFX6-NEXT: s_subb_u32 s3, s3, s9 9301; GFX6-NEXT: v_mov_b32_e32 v0, s0 9302; GFX6-NEXT: v_mov_b32_e32 v1, s1 9303; GFX6-NEXT: v_mov_b32_e32 v2, s2 9304; GFX6-NEXT: v_mov_b32_e32 v3, s3 9305; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9306; GFX6-NEXT: s_endpgm 9307; 9308; GFX9-LABEL: srem_v2i64_pow2k_denom: 9309; GFX9: ; %bb.0: 9310; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 9311; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 9312; GFX9-NEXT: v_mov_b32_e32 v4, 0 9313; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9314; GFX9-NEXT: s_ashr_i32 s4, s1, 31 9315; GFX9-NEXT: s_lshr_b32 s4, s4, 20 9316; GFX9-NEXT: s_add_u32 s4, s0, s4 9317; GFX9-NEXT: s_addc_u32 s5, s1, 0 9318; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 9319; GFX9-NEXT: s_sub_u32 s0, s0, s4 9320; GFX9-NEXT: s_subb_u32 s1, s1, s5 9321; GFX9-NEXT: s_ashr_i32 s4, s3, 31 9322; GFX9-NEXT: s_lshr_b32 s4, s4, 20 9323; GFX9-NEXT: s_add_u32 s4, s2, s4 9324; GFX9-NEXT: s_addc_u32 s5, s3, 0 9325; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 9326; GFX9-NEXT: s_sub_u32 s2, s2, s4 9327; GFX9-NEXT: s_subb_u32 s3, s3, s5 9328; GFX9-NEXT: v_mov_b32_e32 v0, s0 9329; GFX9-NEXT: v_mov_b32_e32 v1, s1 9330; GFX9-NEXT: v_mov_b32_e32 v2, s2 9331; GFX9-NEXT: v_mov_b32_e32 v3, s3 9332; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 9333; GFX9-NEXT: s_endpgm 9334 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 9335 store <2 x i64> %r, ptr addrspace(1) %out 9336 ret void 9337} 9338 9339define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x i64> %x, <2 x i64> %y) { 9340; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 9341; CHECK-NEXT: 
[[SHL_Y:%.*]] = shl <2 x i64> splat (i64 4096), [[Y:%.*]] 9342; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9343; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 9344; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 9345; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0 9346; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 9347; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 9348; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 9349; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 9350; CHECK-NEXT: store <2 x i64> [[TMP8]], ptr addrspace(1) [[OUT:%.*]], align 16 9351; CHECK-NEXT: ret void 9352; 9353; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 9354; GFX6: ; %bb.0: 9355; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd 9356; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 9357; GFX6-NEXT: s_mov_b32 s7, 0xf000 9358; GFX6-NEXT: s_mov_b32 s6, -1 9359; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9360; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 9361; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s14 9362; GFX6-NEXT: s_ashr_i32 s2, s1, 31 9363; GFX6-NEXT: s_add_u32 s0, s0, s2 9364; GFX6-NEXT: s_mov_b32 s3, s2 9365; GFX6-NEXT: s_addc_u32 s1, s1, s2 9366; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[2:3] 9367; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 9368; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 9369; GFX6-NEXT: s_sub_u32 s0, 0, s14 9370; GFX6-NEXT: s_subb_u32 s1, 0, s15 9371; GFX6-NEXT: s_ashr_i32 s12, s9, 31 9372; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9373; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9374; GFX6-NEXT: s_mov_b32 s13, s12 9375; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9376; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9377; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9378; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9379; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9380; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9381; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 9382; 
GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 9383; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 9384; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 9385; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9386; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9387; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9388; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9389; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 9390; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 9391; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 9392; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 9393; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9394; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9395; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9396; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 9397; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc 9398; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 9399; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9400; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9401; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9402; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9403; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 9404; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 9405; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 9406; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9407; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 9408; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9409; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 9410; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9411; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9412; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9413; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9414; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9415; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9416; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9417; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9418; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9419; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9420; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9421; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9422; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9423; GFX6-NEXT: s_add_u32 s0, s8, s12 9424; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9425; GFX6-NEXT: s_addc_u32 s1, s9, s12 9426; GFX6-NEXT: 
v_addc_u32_e32 v1, vcc, v1, v3, vcc 9427; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 9428; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 9429; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 9430; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1 9431; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 9432; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 9433; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9434; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9435; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 9436; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 9437; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9438; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9439; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 9440; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9441; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9442; GFX6-NEXT: v_mul_lo_u32 v1, s14, v1 9443; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 9444; GFX6-NEXT: v_mul_lo_u32 v3, s15, v0 9445; GFX6-NEXT: v_mul_lo_u32 v0, s14, v0 9446; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 9447; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 9448; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 9449; GFX6-NEXT: v_mov_b32_e32 v3, s15 9450; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 9451; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 9452; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s14, v0 9453; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 9454; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s15, v5 9455; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 9456; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v4 9457; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 9458; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 9459; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s15, v5 9460; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s14, v4 9461; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 9462; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 9463; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9464; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] 9465; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 9466; GFX6-NEXT: s_ashr_i32 s0, s17, 31 9467; GFX6-NEXT: s_add_u32 
s2, s16, s0 9468; GFX6-NEXT: s_mov_b32 s1, s0 9469; GFX6-NEXT: s_addc_u32 s3, s17, s0 9470; GFX6-NEXT: v_mov_b32_e32 v4, s9 9471; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[0:1] 9472; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc 9473; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 9474; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s9 9475; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 9476; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9477; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 9478; GFX6-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 9479; GFX6-NEXT: v_rcp_f32_e32 v4, v4 9480; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9481; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1 9482; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc 9483; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 9484; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9485; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 9486; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 9487; GFX6-NEXT: v_trunc_f32_e32 v4, v4 9488; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 9489; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 9490; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 9491; GFX6-NEXT: s_sub_u32 s0, 0, s8 9492; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9493; GFX6-NEXT: v_mul_hi_u32 v3, s0, v2 9494; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 9495; GFX6-NEXT: s_subb_u32 s1, 0, s9 9496; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 9497; GFX6-NEXT: s_ashr_i32 s14, s11, 31 9498; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 9499; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 9500; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9501; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3 9502; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 9503; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3 9504; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 9505; GFX6-NEXT: v_mul_lo_u32 v3, v4, v3 9506; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9507; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9508; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 9509; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 9510; GFX6-NEXT: s_mov_b32 s15, s14 9511; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 9512; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 
9513; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 9514; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc 9515; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 9516; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9517; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9518; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 9519; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 9520; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 9521; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 9522; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 9523; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 9524; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 9525; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9526; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 9527; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 9528; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 9529; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 9530; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 9531; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 9532; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9533; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 9534; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 9535; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 9536; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 9537; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 9538; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9539; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9540; GFX6-NEXT: s_add_u32 s0, s10, s14 9541; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9542; GFX6-NEXT: s_addc_u32 s1, s11, s14 9543; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 9544; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 9545; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 9546; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 9547; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 9548; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 9549; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 9550; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9551; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9552; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 9553; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 9554; GFX6-NEXT: v_mov_b32_e32 v6, s12 9555; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 9556; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 
v5, v2, vcc 9557; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 9558; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9559; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9560; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 9561; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 9562; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 9563; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 9564; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 9565; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc 9566; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 9567; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 9568; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 9569; GFX6-NEXT: v_mov_b32_e32 v5, s9 9570; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 9571; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 9572; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 9573; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 9574; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 9575; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 9576; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 9577; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 9578; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 9579; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 9580; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 9581; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 9582; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9583; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 9584; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] 9585; GFX6-NEXT: v_mov_b32_e32 v6, s11 9586; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc 9587; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 9588; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9589; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 9590; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 9591; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9592; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 9593; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 9594; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9595; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 9596; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 9597; 
GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 9598; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 9599; GFX6-NEXT: v_mov_b32_e32 v4, s14 9600; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 9601; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 9602; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9603; GFX6-NEXT: s_endpgm 9604; 9605; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 9606; GFX9: ; %bb.0: 9607; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 9608; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 9609; GFX9-NEXT: v_mov_b32_e32 v4, 0 9610; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9611; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 9612; GFX9-NEXT: s_lshl_b64 s[14:15], 0x1000, s14 9613; GFX9-NEXT: s_ashr_i32 s2, s1, 31 9614; GFX9-NEXT: s_add_u32 s0, s0, s2 9615; GFX9-NEXT: s_mov_b32 s3, s2 9616; GFX9-NEXT: s_addc_u32 s1, s1, s2 9617; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] 9618; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 9619; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 9620; GFX9-NEXT: s_sub_u32 s0, 0, s12 9621; GFX9-NEXT: s_subb_u32 s1, 0, s13 9622; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9623; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9624; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9625; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9626; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9627; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9628; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9629; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9630; GFX9-NEXT: v_readfirstlane_b32 s2, v1 9631; GFX9-NEXT: v_readfirstlane_b32 s3, v0 9632; GFX9-NEXT: s_mul_i32 s4, s0, s2 9633; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3 9634; GFX9-NEXT: s_mul_i32 s5, s1, s3 9635; GFX9-NEXT: s_add_i32 s4, s16, s4 9636; GFX9-NEXT: s_mul_i32 s17, s0, s3 9637; GFX9-NEXT: s_add_i32 s4, s4, s5 9638; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 9639; GFX9-NEXT: s_mul_i32 s16, s3, s4 9640; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17 9641; GFX9-NEXT: s_add_u32 s3, s3, s16 9642; GFX9-NEXT: s_addc_u32 s5, 0, s5 9643; GFX9-NEXT: s_mul_hi_u32 s18, s2, s17 9644; GFX9-NEXT: s_mul_i32 s17, s2, s17 9645; 
GFX9-NEXT: s_add_u32 s3, s3, s17 9646; GFX9-NEXT: s_mul_hi_u32 s16, s2, s4 9647; GFX9-NEXT: s_addc_u32 s3, s5, s18 9648; GFX9-NEXT: s_addc_u32 s5, s16, 0 9649; GFX9-NEXT: s_mul_i32 s4, s2, s4 9650; GFX9-NEXT: s_add_u32 s3, s3, s4 9651; GFX9-NEXT: s_addc_u32 s4, 0, s5 9652; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 9653; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9654; GFX9-NEXT: s_addc_u32 s2, s2, s4 9655; GFX9-NEXT: v_readfirstlane_b32 s4, v0 9656; GFX9-NEXT: s_mul_i32 s3, s0, s2 9657; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4 9658; GFX9-NEXT: s_add_i32 s3, s5, s3 9659; GFX9-NEXT: s_mul_i32 s1, s1, s4 9660; GFX9-NEXT: s_add_i32 s3, s3, s1 9661; GFX9-NEXT: s_mul_i32 s0, s0, s4 9662; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0 9663; GFX9-NEXT: s_mul_i32 s16, s2, s0 9664; GFX9-NEXT: s_mul_i32 s18, s4, s3 9665; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0 9666; GFX9-NEXT: s_mul_hi_u32 s17, s4, s3 9667; GFX9-NEXT: s_add_u32 s0, s0, s18 9668; GFX9-NEXT: s_addc_u32 s4, 0, s17 9669; GFX9-NEXT: s_add_u32 s0, s0, s16 9670; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 9671; GFX9-NEXT: s_addc_u32 s0, s4, s5 9672; GFX9-NEXT: s_addc_u32 s1, s1, 0 9673; GFX9-NEXT: s_mul_i32 s3, s2, s3 9674; GFX9-NEXT: s_add_u32 s0, s0, s3 9675; GFX9-NEXT: s_addc_u32 s1, 0, s1 9676; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 9677; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9678; GFX9-NEXT: s_addc_u32 s2, s2, s1 9679; GFX9-NEXT: s_ashr_i32 s16, s9, 31 9680; GFX9-NEXT: s_add_u32 s0, s8, s16 9681; GFX9-NEXT: s_mov_b32 s17, s16 9682; GFX9-NEXT: s_addc_u32 s1, s9, s16 9683; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] 9684; GFX9-NEXT: v_readfirstlane_b32 s3, v0 9685; GFX9-NEXT: s_mul_i32 s1, s4, s2 9686; GFX9-NEXT: s_mul_hi_u32 s8, s4, s3 9687; GFX9-NEXT: s_mul_hi_u32 s0, s4, s2 9688; GFX9-NEXT: s_add_u32 s1, s8, s1 9689; GFX9-NEXT: s_addc_u32 s0, 0, s0 9690; GFX9-NEXT: s_mul_hi_u32 s9, s5, s3 9691; GFX9-NEXT: s_mul_i32 s3, s5, s3 9692; GFX9-NEXT: s_add_u32 s1, s1, s3 9693; GFX9-NEXT: s_mul_hi_u32 s8, s5, s2 9694; GFX9-NEXT: s_addc_u32 s0, s0, s9 9695; 
GFX9-NEXT: s_addc_u32 s1, s8, 0 9696; GFX9-NEXT: s_mul_i32 s2, s5, s2 9697; GFX9-NEXT: s_add_u32 s0, s0, s2 9698; GFX9-NEXT: s_addc_u32 s1, 0, s1 9699; GFX9-NEXT: s_mul_i32 s1, s12, s1 9700; GFX9-NEXT: s_mul_hi_u32 s2, s12, s0 9701; GFX9-NEXT: s_add_i32 s1, s2, s1 9702; GFX9-NEXT: s_mul_i32 s2, s13, s0 9703; GFX9-NEXT: s_mul_i32 s0, s12, s0 9704; GFX9-NEXT: s_add_i32 s8, s1, s2 9705; GFX9-NEXT: v_mov_b32_e32 v0, s0 9706; GFX9-NEXT: s_sub_i32 s1, s5, s8 9707; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 9708; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9709; GFX9-NEXT: s_subb_u32 s4, s1, s13 9710; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 9711; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9712; GFX9-NEXT: s_subb_u32 s9, s4, 0 9713; GFX9-NEXT: s_cmp_ge_u32 s9, s13 9714; GFX9-NEXT: s_cselect_b32 s17, -1, 0 9715; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 9716; GFX9-NEXT: s_cmp_eq_u32 s9, s13 9717; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] 9718; GFX9-NEXT: v_mov_b32_e32 v3, s17 9719; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 9720; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9721; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] 9722; GFX9-NEXT: s_subb_u32 s2, s4, s13 9723; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 9724; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9725; GFX9-NEXT: s_subb_u32 s2, s2, 0 9726; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 9727; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9728; GFX9-NEXT: v_mov_b32_e32 v2, s9 9729; GFX9-NEXT: v_mov_b32_e32 v3, s2 9730; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9731; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 9732; GFX9-NEXT: s_subb_u32 s0, s5, s8 9733; GFX9-NEXT: s_cmp_ge_u32 s0, s13 9734; GFX9-NEXT: s_cselect_b32 s1, -1, 0 9735; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 9736; GFX9-NEXT: s_cmp_eq_u32 s0, s13 9737; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9738; GFX9-NEXT: v_mov_b32_e32 v5, s1 9739; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 9740; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 9741; GFX9-NEXT: v_mov_b32_e32 v5, s0 9742; 
GFX9-NEXT: s_ashr_i32 s0, s15, 31 9743; GFX9-NEXT: s_add_u32 s2, s14, s0 9744; GFX9-NEXT: s_mov_b32 s1, s0 9745; GFX9-NEXT: s_addc_u32 s3, s15, s0 9746; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9747; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] 9748; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 9749; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 9750; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 9751; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc 9752; GFX9-NEXT: v_xor_b32_e32 v0, s16, v0 9753; GFX9-NEXT: v_xor_b32_e32 v2, s16, v2 9754; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3 9755; GFX9-NEXT: v_rcp_f32_e32 v3, v1 9756; GFX9-NEXT: v_mov_b32_e32 v5, s16 9757; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v0 9758; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v5, vcc 9759; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 9760; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 9761; GFX9-NEXT: v_trunc_f32_e32 v3, v3 9762; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 9763; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 9764; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 9765; GFX9-NEXT: s_sub_u32 s0, 0, s4 9766; GFX9-NEXT: s_subb_u32 s1, 0, s5 9767; GFX9-NEXT: v_readfirstlane_b32 s2, v2 9768; GFX9-NEXT: v_readfirstlane_b32 s9, v3 9769; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 9770; GFX9-NEXT: s_mul_i32 s12, s0, s9 9771; GFX9-NEXT: s_mul_i32 s3, s1, s2 9772; GFX9-NEXT: s_add_i32 s8, s8, s12 9773; GFX9-NEXT: s_add_i32 s8, s8, s3 9774; GFX9-NEXT: s_mul_i32 s13, s0, s2 9775; GFX9-NEXT: s_mul_hi_u32 s3, s2, s8 9776; GFX9-NEXT: s_mul_i32 s12, s2, s8 9777; GFX9-NEXT: s_mul_hi_u32 s2, s2, s13 9778; GFX9-NEXT: s_add_u32 s2, s2, s12 9779; GFX9-NEXT: s_addc_u32 s3, 0, s3 9780; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 9781; GFX9-NEXT: s_mul_i32 s13, s9, s13 9782; GFX9-NEXT: s_add_u32 s2, s2, s13 9783; GFX9-NEXT: s_mul_hi_u32 s12, s9, s8 9784; GFX9-NEXT: s_addc_u32 s2, s3, s14 9785; GFX9-NEXT: s_addc_u32 s3, s12, 0 9786; GFX9-NEXT: s_mul_i32 s8, s9, s8 9787; GFX9-NEXT: s_add_u32 s2, s2, s8 9788; GFX9-NEXT: s_addc_u32 s3, 0, s3 9789; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, s2, v2 9790; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9791; GFX9-NEXT: s_addc_u32 s2, s9, s3 9792; GFX9-NEXT: v_readfirstlane_b32 s8, v2 9793; GFX9-NEXT: s_mul_i32 s3, s0, s2 9794; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 9795; GFX9-NEXT: s_add_i32 s3, s9, s3 9796; GFX9-NEXT: s_mul_i32 s1, s1, s8 9797; GFX9-NEXT: s_add_i32 s3, s3, s1 9798; GFX9-NEXT: s_mul_i32 s0, s0, s8 9799; GFX9-NEXT: s_mul_hi_u32 s9, s2, s0 9800; GFX9-NEXT: s_mul_i32 s12, s2, s0 9801; GFX9-NEXT: s_mul_i32 s14, s8, s3 9802; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 9803; GFX9-NEXT: s_mul_hi_u32 s13, s8, s3 9804; GFX9-NEXT: s_add_u32 s0, s0, s14 9805; GFX9-NEXT: s_addc_u32 s8, 0, s13 9806; GFX9-NEXT: s_add_u32 s0, s0, s12 9807; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 9808; GFX9-NEXT: s_addc_u32 s0, s8, s9 9809; GFX9-NEXT: s_addc_u32 s1, s1, 0 9810; GFX9-NEXT: s_mul_i32 s3, s2, s3 9811; GFX9-NEXT: s_add_u32 s0, s0, s3 9812; GFX9-NEXT: s_addc_u32 s1, 0, s1 9813; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 9814; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9815; GFX9-NEXT: s_addc_u32 s2, s2, s1 9816; GFX9-NEXT: s_ashr_i32 s8, s11, 31 9817; GFX9-NEXT: s_add_u32 s0, s10, s8 9818; GFX9-NEXT: s_mov_b32 s9, s8 9819; GFX9-NEXT: s_addc_u32 s1, s11, s8 9820; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] 9821; GFX9-NEXT: v_readfirstlane_b32 s3, v2 9822; GFX9-NEXT: s_mul_i32 s1, s10, s2 9823; GFX9-NEXT: s_mul_hi_u32 s9, s10, s3 9824; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2 9825; GFX9-NEXT: s_add_u32 s1, s9, s1 9826; GFX9-NEXT: s_addc_u32 s0, 0, s0 9827; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3 9828; GFX9-NEXT: s_mul_i32 s3, s11, s3 9829; GFX9-NEXT: s_add_u32 s1, s1, s3 9830; GFX9-NEXT: s_mul_hi_u32 s9, s11, s2 9831; GFX9-NEXT: s_addc_u32 s0, s0, s12 9832; GFX9-NEXT: s_addc_u32 s1, s9, 0 9833; GFX9-NEXT: s_mul_i32 s2, s11, s2 9834; GFX9-NEXT: s_add_u32 s0, s0, s2 9835; GFX9-NEXT: s_addc_u32 s1, 0, s1 9836; GFX9-NEXT: s_mul_i32 s1, s4, s1 9837; GFX9-NEXT: s_mul_hi_u32 s2, s4, s0 9838; GFX9-NEXT: s_add_i32 s1, s2, s1 9839; GFX9-NEXT: 
s_mul_i32 s2, s5, s0 9840; GFX9-NEXT: s_mul_i32 s0, s4, s0 9841; GFX9-NEXT: s_add_i32 s9, s1, s2 9842; GFX9-NEXT: v_mov_b32_e32 v2, s0 9843; GFX9-NEXT: s_sub_i32 s1, s11, s9 9844; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 9845; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9846; GFX9-NEXT: s_subb_u32 s10, s1, s5 9847; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 9848; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9849; GFX9-NEXT: s_subb_u32 s12, s10, 0 9850; GFX9-NEXT: s_cmp_ge_u32 s12, s5 9851; GFX9-NEXT: s_cselect_b32 s13, -1, 0 9852; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v3 9853; GFX9-NEXT: s_cmp_eq_u32 s12, s5 9854; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] 9855; GFX9-NEXT: v_mov_b32_e32 v6, s13 9856; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 9857; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9858; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] 9859; GFX9-NEXT: s_subb_u32 s2, s10, s5 9860; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3 9861; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9862; GFX9-NEXT: s_subb_u32 s2, s2, 0 9863; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9864; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 9865; GFX9-NEXT: v_mov_b32_e32 v5, s12 9866; GFX9-NEXT: v_mov_b32_e32 v6, s2 9867; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9868; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] 9869; GFX9-NEXT: s_subb_u32 s0, s11, s9 9870; GFX9-NEXT: s_cmp_ge_u32 s0, s5 9871; GFX9-NEXT: s_cselect_b32 s1, -1, 0 9872; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 9873; GFX9-NEXT: s_cmp_eq_u32 s0, s5 9874; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9875; GFX9-NEXT: v_mov_b32_e32 v7, s1 9876; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 9877; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc 9878; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9879; GFX9-NEXT: v_mov_b32_e32 v7, s0 9880; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 9881; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc 9882; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 9883; GFX9-NEXT: v_xor_b32_e32 v3, s8, v5 9884; GFX9-NEXT: v_mov_b32_e32 v5, s8 9885; GFX9-NEXT: 
v_subrev_co_u32_e32 v2, vcc, s8, v2 9886; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 9887; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] 9888; GFX9-NEXT: s_endpgm 9889 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 9890 %r = srem <2 x i64> %x, %shl.y 9891 store <2 x i64> %r, ptr addrspace(1) %out 9892 ret void 9893} 9894 9895define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) { 9896; CHECK-LABEL: @v_sdiv_i32_exact( 9897; CHECK: %1 = extractelement <2 x i32> %num, i64 0 9898; CHECK-NEXT: %2 = sdiv exact i32 %1, 4096 9899; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 9900; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 9901; CHECK-NEXT: %5 = sdiv exact i32 %4, 1024 9902; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 9903; CHECK-NEXT: ret <2 x i32> %6 9904; 9905; GFX6-LABEL: v_sdiv_i32_exact: 9906; GFX6: ; %bb.0: 9907; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9908; GFX6-NEXT: v_ashrrev_i32_e32 v0, 12, v0 9909; GFX6-NEXT: v_ashrrev_i32_e32 v1, 10, v1 9910; GFX6-NEXT: s_setpc_b64 s[30:31] 9911; 9912; GFX9-LABEL: v_sdiv_i32_exact: 9913; GFX9: ; %bb.0: 9914; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9915; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 9916; GFX9-NEXT: v_ashrrev_i32_e32 v1, 10, v1 9917; GFX9-NEXT: s_setpc_b64 s[30:31] 9918 %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024> 9919 ret <2 x i32> %result 9920} 9921 9922define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) { 9923; CHECK-LABEL: @v_sdiv_i64_exact( 9924; CHECK: %1 = extractelement <2 x i64> %num, i64 0 9925; CHECK-NEXT: %2 = sdiv exact i64 %1, 4096 9926; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 9927; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 9928; CHECK-NEXT: %5 = sdiv exact i64 %4, 1024 9929; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 9930; CHECK-NEXT: ret <2 x i64> %6 9931; 9932; GFX6-LABEL: v_sdiv_i64_exact: 9933; GFX6: ; %bb.0: 9934; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 9935; GFX6-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 9936; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 9937; GFX6-NEXT: s_setpc_b64 s[30:31] 9938; 9939; GFX9-LABEL: v_sdiv_i64_exact: 9940; GFX9: ; %bb.0: 9941; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9942; GFX9-NEXT: v_ashrrev_i64 v[0:1], 12, v[0:1] 9943; GFX9-NEXT: v_ashrrev_i64 v[2:3], 10, v[2:3] 9944; GFX9-NEXT: s_setpc_b64 s[30:31] 9945 %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024> 9946 ret <2 x i64> %result 9947} 9948 9949define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) { 9950; CHECK-LABEL: @v_udiv_i32_exact( 9951; CHECK: %1 = extractelement <2 x i32> %num, i64 0 9952; CHECK-NEXT: %2 = udiv exact i32 %1, 4096 9953; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 9954; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 9955; CHECK-NEXT: %5 = udiv exact i32 %4, 1024 9956; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 9957; CHECK-NEXT: ret <2 x i32> %6 9958; 9959; GFX6-LABEL: v_udiv_i32_exact: 9960; GFX6: ; %bb.0: 9961; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9962; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v0 9963; GFX6-NEXT: v_lshrrev_b32_e32 v1, 10, v1 9964; GFX6-NEXT: s_setpc_b64 s[30:31] 9965; 9966; GFX9-LABEL: v_udiv_i32_exact: 9967; GFX9: ; %bb.0: 9968; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9969; GFX9-NEXT: v_lshrrev_b32_e32 v0, 12, v0 9970; GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v1 9971; GFX9-NEXT: s_setpc_b64 s[30:31] 9972 %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024> 9973 ret <2 x i32> %result 9974} 9975 9976define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) { 9977; CHECK-LABEL: @v_udiv_i64_exact( 9978; CHECK: %1 = extractelement <2 x i64> %num, i64 0 9979; CHECK-NEXT: %2 = udiv exact i64 %1, 4096 9980; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 9981; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 9982; CHECK-NEXT: %5 = udiv exact i64 %4, 1024 9983; CHECK-NEXT: %6 = insertelement <2 x i64> %3, 
i64 %5, i64 1 9984; CHECK-NEXT: ret <2 x i64> %6 9985; 9986; GFX6-LABEL: v_udiv_i64_exact: 9987; GFX6: ; %bb.0: 9988; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9989; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 12 9990; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 10 9991; GFX6-NEXT: s_setpc_b64 s[30:31] 9992; 9993; GFX9-LABEL: v_udiv_i64_exact: 9994; GFX9: ; %bb.0: 9995; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 9996; GFX9-NEXT: v_lshrrev_b64 v[0:1], 12, v[0:1] 9997; GFX9-NEXT: v_lshrrev_b64 v[2:3], 10, v[2:3] 9998; GFX9-NEXT: s_setpc_b64 s[30:31] 9999 %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024> 10000 ret <2 x i64> %result 10001} 10002 10003define i64 @udiv_i64_gt_smax(i8 %size) { 10004; GFX6-LABEL: udiv_i64_gt_smax: 10005; GFX6: ; %bb.0: 10006; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10007; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 10008; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 10009; GFX6-NEXT: v_not_b32_e32 v1, v1 10010; GFX6-NEXT: v_not_b32_e32 v0, v0 10011; GFX6-NEXT: s_mov_b32 s4, 0xcccccccd 10012; GFX6-NEXT: v_mul_lo_u32 v3, v1, s4 10013; GFX6-NEXT: v_mul_hi_u32 v4, v0, s4 10014; GFX6-NEXT: s_mov_b32 s6, 0xcccccccc 10015; GFX6-NEXT: v_mul_hi_u32 v5, v1, s4 10016; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 10017; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 10018; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 10019; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 10020; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 10021; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 10022; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 10023; GFX6-NEXT: v_mul_hi_u32 v1, v1, s6 10024; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 10025; GFX6-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc 10026; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 10027; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10028; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 3 10029; GFX6-NEXT: v_lshrrev_b32_e32 v1, 3, v1 10030; GFX6-NEXT: s_setpc_b64 s[30:31] 10031; 10032; GFX9-LABEL: udiv_i64_gt_smax: 10033; GFX9: ; %bb.0: 10034; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10035; GFX9-NEXT: v_mov_b32_e32 v1, 31 10036; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 10037; GFX9-NEXT: s_mov_b32 s4, 0xcccccccd 10038; GFX9-NEXT: v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 10039; GFX9-NEXT: v_mul_hi_u32 v0, v4, s4 10040; GFX9-NEXT: v_not_b32_e32 v5, v1 10041; GFX9-NEXT: v_mov_b32_e32 v1, 0 10042; GFX9-NEXT: s_mov_b32 s6, 0xcccccccc 10043; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, s4, v[0:1] 10044; GFX9-NEXT: v_mov_b32_e32 v6, v3 10045; GFX9-NEXT: v_mov_b32_e32 v3, v1 10046; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, s6, v[2:3] 10047; GFX9-NEXT: v_mov_b32_e32 v0, v1 10048; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 10049; GFX9-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, 0, vcc 10050; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, s6, v[0:1] 10051; GFX9-NEXT: v_alignbit_b32 v0, v1, v0, 3 10052; GFX9-NEXT: v_lshrrev_b32_e32 v1, 3, v1 10053; GFX9-NEXT: s_setpc_b64 s[30:31] 10054 %esize = sext i8 %size to i64 10055 %minus = sub nuw nsw i64 -1, %esize 10056 %div = udiv i64 %minus, 10 10057 ret i64 %div 10058} 10059 10060define i64 @udiv_i64_9divbits(i8 %size) { 10061; GFX6-LABEL: udiv_i64_9divbits: 10062; GFX6: ; %bb.0: 10063; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10064; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0 10065; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0 10066; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 10067; GFX6-NEXT: s_mov_b32 s4, 0x41200000 10068; GFX6-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0 10069; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10070; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 10071; GFX6-NEXT: v_mad_f32 v0, -v1, s4, v0 10072; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 10073; GFX6-NEXT: v_mov_b32_e32 v1, 0 10074; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 10075; GFX6-NEXT: v_and_b32_e32 v0, 0x1ff, v0 10076; GFX6-NEXT: s_setpc_b64 s[30:31] 10077; 10078; GFX9-LABEL: udiv_i64_9divbits: 10079; GFX9: ; 
%bb.0: 10080; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 10081; GFX9-NEXT: v_mov_b32_e32 v1, 1 10082; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 10083; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 10084; GFX9-NEXT: s_mov_b32 s4, 0x41200000 10085; GFX9-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0 10086; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10087; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 10088; GFX9-NEXT: v_mad_f32 v0, -v1, s4, v0 10089; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4 10090; GFX9-NEXT: v_mov_b32_e32 v1, 0 10091; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc 10092; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff, v0 10093; GFX9-NEXT: s_setpc_b64 s[30:31] 10094 %zextend = zext i8 %size to i64 10095 %num = add nuw nsw i64 1, %zextend 10096 %div = udiv i64 %num, 10 10097 ret i64 %div 10098} 10099