1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL %s 3; RUN: llc -mtriple=amdgcn -stop-after=amdgpu-isel -enable-new-pm < %s | FileCheck -enable-var-scope -check-prefixes=GCN-ISEL %s 4 5; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CISI %s 6; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s 8; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1010 %s 9; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W32 %s 10; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s 11; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s 12 13; GCN-ISEL-LABEL: name: sadd64rr 14; GCN-ISEL-LABEL: body: 15; GCN-ISEL-LABEL: bb.0.entry: 16; GCN-ISEL: S_ADD_U64_PSEUDO 17 18define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { 19; CISI-LABEL: sadd64rr: 20; CISI: ; %bb.0: ; %entry 21; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 22; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 23; CISI-NEXT: s_mov_b32 s7, 0xf000 24; CISI-NEXT: s_mov_b32 s6, -1 25; CISI-NEXT: s_waitcnt lgkmcnt(0) 26; CISI-NEXT: s_mov_b32 s4, s0 27; CISI-NEXT: s_add_u32 s0, s2, s8 28; CISI-NEXT: s_mov_b32 s5, s1 29; CISI-NEXT: s_addc_u32 s1, s3, s9 30; CISI-NEXT: v_mov_b32_e32 v0, s0 31; CISI-NEXT: v_mov_b32_e32 v1, s1 32; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 33; CISI-NEXT: 
s_endpgm 34; 35; VI-LABEL: sadd64rr: 36; VI: ; %bb.0: ; %entry 37; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 38; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 39; VI-NEXT: s_waitcnt lgkmcnt(0) 40; VI-NEXT: v_mov_b32_e32 v0, s0 41; VI-NEXT: s_add_u32 s0, s2, s4 42; VI-NEXT: v_mov_b32_e32 v1, s1 43; VI-NEXT: s_addc_u32 s1, s3, s5 44; VI-NEXT: v_mov_b32_e32 v3, s1 45; VI-NEXT: v_mov_b32_e32 v2, s0 46; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 47; VI-NEXT: s_endpgm 48; 49; GFX9-LABEL: sadd64rr: 50; GFX9: ; %bb.0: ; %entry 51; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 52; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 53; GFX9-NEXT: v_mov_b32_e32 v2, 0 54; GFX9-NEXT: s_waitcnt lgkmcnt(0) 55; GFX9-NEXT: s_add_u32 s2, s2, s6 56; GFX9-NEXT: s_addc_u32 s3, s3, s7 57; GFX9-NEXT: v_mov_b32_e32 v0, s2 58; GFX9-NEXT: v_mov_b32_e32 v1, s3 59; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 60; GFX9-NEXT: s_endpgm 61; 62; GFX1010-LABEL: sadd64rr: 63; GFX1010: ; %bb.0: ; %entry 64; GFX1010-NEXT: s_clause 0x1 65; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 66; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 67; GFX1010-NEXT: v_mov_b32_e32 v2, 0 68; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 69; GFX1010-NEXT: s_add_u32 s2, s2, s6 70; GFX1010-NEXT: s_addc_u32 s3, s3, s7 71; GFX1010-NEXT: v_mov_b32_e32 v0, s2 72; GFX1010-NEXT: v_mov_b32_e32 v1, s3 73; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 74; GFX1010-NEXT: s_endpgm 75; 76; GFX1030W32-LABEL: sadd64rr: 77; GFX1030W32: ; %bb.0: ; %entry 78; GFX1030W32-NEXT: s_clause 0x1 79; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 80; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 81; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 82; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 83; GFX1030W32-NEXT: s_add_u32 s2, s2, s4 84; GFX1030W32-NEXT: s_addc_u32 s3, s3, s5 85; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2 86; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3 87; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 88; GFX1030W32-NEXT: 
s_endpgm 89; 90; GFX1030W64-LABEL: sadd64rr: 91; GFX1030W64: ; %bb.0: ; %entry 92; GFX1030W64-NEXT: s_clause 0x1 93; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 94; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 95; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 96; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 97; GFX1030W64-NEXT: s_add_u32 s2, s2, s4 98; GFX1030W64-NEXT: s_addc_u32 s3, s3, s5 99; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2 100; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3 101; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 102; GFX1030W64-NEXT: s_endpgm 103; 104; GFX11-LABEL: sadd64rr: 105; GFX11: ; %bb.0: ; %entry 106; GFX11-NEXT: s_clause 0x1 107; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 108; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 109; GFX11-NEXT: s_waitcnt lgkmcnt(0) 110; GFX11-NEXT: s_add_u32 s2, s2, s4 111; GFX11-NEXT: s_addc_u32 s3, s3, s5 112; GFX11-NEXT: v_mov_b32_e32 v0, s2 113; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 114; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 115; GFX11-NEXT: s_endpgm 116entry: 117 %add = add i64 %a, %b 118 store i64 %add, ptr addrspace(1) %out 119 ret void 120} 121 122; GCN-ISEL-LABEL: name: sadd64ri 123; GCN-ISEL-LABEL: body: 124; GCN-ISEL-LABEL: bb.0.entry: 125; GCN-ISEL: S_ADD_U64_PSEUDO 126 127define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { 128; CISI-LABEL: sadd64ri: 129; CISI: ; %bb.0: ; %entry 130; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 131; CISI-NEXT: s_mov_b32 s7, 0xf000 132; CISI-NEXT: s_mov_b32 s6, -1 133; CISI-NEXT: s_waitcnt lgkmcnt(0) 134; CISI-NEXT: s_mov_b32 s4, s0 135; CISI-NEXT: s_add_u32 s0, s2, 0x56789876 136; CISI-NEXT: s_mov_b32 s5, s1 137; CISI-NEXT: s_addc_u32 s1, s3, 0x1234 138; CISI-NEXT: v_mov_b32_e32 v0, s0 139; CISI-NEXT: v_mov_b32_e32 v1, s1 140; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 141; CISI-NEXT: s_endpgm 142; 143; VI-LABEL: sadd64ri: 144; VI: ; %bb.0: ; %entry 145; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
146; VI-NEXT: s_waitcnt lgkmcnt(0) 147; VI-NEXT: v_mov_b32_e32 v0, s0 148; VI-NEXT: s_add_u32 s0, s2, 0x56789876 149; VI-NEXT: v_mov_b32_e32 v1, s1 150; VI-NEXT: s_addc_u32 s1, s3, 0x1234 151; VI-NEXT: v_mov_b32_e32 v3, s1 152; VI-NEXT: v_mov_b32_e32 v2, s0 153; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 154; VI-NEXT: s_endpgm 155; 156; GFX9-LABEL: sadd64ri: 157; GFX9: ; %bb.0: ; %entry 158; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 159; GFX9-NEXT: v_mov_b32_e32 v2, 0 160; GFX9-NEXT: s_waitcnt lgkmcnt(0) 161; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876 162; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234 163; GFX9-NEXT: v_mov_b32_e32 v0, s2 164; GFX9-NEXT: v_mov_b32_e32 v1, s3 165; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 166; GFX9-NEXT: s_endpgm 167; 168; GFX1010-LABEL: sadd64ri: 169; GFX1010: ; %bb.0: ; %entry 170; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 171; GFX1010-NEXT: v_mov_b32_e32 v2, 0 172; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 173; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876 174; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234 175; GFX1010-NEXT: v_mov_b32_e32 v0, s2 176; GFX1010-NEXT: v_mov_b32_e32 v1, s3 177; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 178; GFX1010-NEXT: s_endpgm 179; 180; GFX1030W32-LABEL: sadd64ri: 181; GFX1030W32: ; %bb.0: ; %entry 182; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 183; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 184; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 185; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876 186; GFX1030W32-NEXT: s_addc_u32 s3, s3, 0x1234 187; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2 188; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3 189; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 190; GFX1030W32-NEXT: s_endpgm 191; 192; GFX1030W64-LABEL: sadd64ri: 193; GFX1030W64: ; %bb.0: ; %entry 194; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 195; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 196; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 197; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876 198; 
GFX1030W64-NEXT: s_addc_u32 s3, s3, 0x1234 199; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2 200; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3 201; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 202; GFX1030W64-NEXT: s_endpgm 203; 204; GFX11-LABEL: sadd64ri: 205; GFX11: ; %bb.0: ; %entry 206; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 207; GFX11-NEXT: s_waitcnt lgkmcnt(0) 208; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876 209; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234 210; GFX11-NEXT: v_mov_b32_e32 v0, s2 211; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 212; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 213; GFX11-NEXT: s_endpgm 214entry: 215 %add = add i64 20015998343286, %a 216 store i64 %add, ptr addrspace(1) %out 217 ret void 218} 219 220; GCN-ISEL-LABEL: name: vadd64rr 221; GCN-ISEL-LABEL: body: 222; GCN-ISEL-LABEL: bb.0.entry: 223; GCN-ISEL: V_ADD_U64_PSEUDO 224 225define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { 226; CISI-LABEL: vadd64rr: 227; CISI: ; %bb.0: ; %entry 228; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 229; CISI-NEXT: s_mov_b32 s7, 0xf000 230; CISI-NEXT: s_mov_b32 s6, -1 231; CISI-NEXT: s_waitcnt lgkmcnt(0) 232; CISI-NEXT: v_mov_b32_e32 v1, s3 233; CISI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 234; CISI-NEXT: s_mov_b32 s4, s0 235; CISI-NEXT: s_mov_b32 s5, s1 236; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 237; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 238; CISI-NEXT: s_endpgm 239; 240; VI-LABEL: vadd64rr: 241; VI: ; %bb.0: ; %entry 242; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 243; VI-NEXT: s_waitcnt lgkmcnt(0) 244; VI-NEXT: v_mov_b32_e32 v4, s3 245; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 246; VI-NEXT: v_mov_b32_e32 v1, s0 247; VI-NEXT: v_mov_b32_e32 v2, s1 248; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 249; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4] 250; VI-NEXT: s_endpgm 251; 252; GFX9-LABEL: vadd64rr: 253; GFX9: ; %bb.0: ; %entry 254; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 255; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 257; GFX9-NEXT: v_mov_b32_e32 v1, s3 258; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 259; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 260; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 261; GFX9-NEXT: s_endpgm 262; 263; GFX1010-LABEL: vadd64rr: 264; GFX1010: ; %bb.0: ; %entry 265; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 266; GFX1010-NEXT: v_mov_b32_e32 v2, 0 267; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 268; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0 269; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 270; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 271; GFX1010-NEXT: s_endpgm 272; 273; GFX1030W32-LABEL: vadd64rr: 274; GFX1030W32: ; %bb.0: ; %entry 275; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 276; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 277; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 278; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0 279; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 280; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 281; GFX1030W32-NEXT: s_endpgm 282; 283; GFX1030W64-LABEL: vadd64rr: 284; GFX1030W64: ; %bb.0: ; %entry 285; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 286; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 287; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 288; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0 289; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[4:5] 290; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 291; GFX1030W64-NEXT: s_endpgm 292; 293; GFX11-LABEL: vadd64rr: 294; GFX11: ; %bb.0: ; %entry 295; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 296; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 297; GFX11-NEXT: v_mov_b32_e32 v2, 0 298; GFX11-NEXT: s_waitcnt lgkmcnt(0) 299; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 300; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0 301; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 302; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 
303; GFX11-NEXT: s_endpgm 304entry: 305 %tid = call i32 @llvm.amdgcn.workitem.id.x() 306 %tid.ext = sext i32 %tid to i64 307 %add = add i64 %a, %tid.ext 308 store i64 %add, ptr addrspace(1) %out 309 ret void 310} 311 312; GCN-ISEL-LABEL: name: vadd64ri 313; GCN-ISEL-LABEL: body: 314; GCN-ISEL-LABEL: bb.0.entry: 315; GCN-ISEL: V_ADD_U64_PSEUDO 316 317define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { 318; CISI-LABEL: vadd64ri: 319; CISI: ; %bb.0: ; %entry 320; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 321; CISI-NEXT: v_add_i32_e32 v0, vcc, 0x56789876, v0 322; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 323; CISI-NEXT: s_mov_b32 s3, 0xf000 324; CISI-NEXT: s_mov_b32 s2, -1 325; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 326; CISI-NEXT: s_waitcnt lgkmcnt(0) 327; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 328; CISI-NEXT: s_endpgm 329; 330; VI-LABEL: vadd64ri: 331; VI: ; %bb.0: ; %entry 332; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 333; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0 334; VI-NEXT: v_mov_b32_e32 v1, 0x1234 335; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 336; VI-NEXT: s_waitcnt lgkmcnt(0) 337; VI-NEXT: v_mov_b32_e32 v3, s1 338; VI-NEXT: v_mov_b32_e32 v2, s0 339; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 340; VI-NEXT: s_endpgm 341; 342; GFX9-LABEL: vadd64ri: 343; GFX9: ; %bb.0: ; %entry 344; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 345; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0 346; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 347; GFX9-NEXT: v_mov_b32_e32 v2, 0 348; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 349; GFX9-NEXT: s_waitcnt lgkmcnt(0) 350; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 351; GFX9-NEXT: s_endpgm 352; 353; GFX1010-LABEL: vadd64ri: 354; GFX1010: ; %bb.0: ; %entry 355; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 356; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 357; GFX1010-NEXT: v_mov_b32_e32 v2, 0 358; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 359; GFX1010-NEXT: 
s_waitcnt lgkmcnt(0) 360; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 361; GFX1010-NEXT: s_endpgm 362; 363; GFX1030W32-LABEL: vadd64ri: 364; GFX1030W32: ; %bb.0: ; %entry 365; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 366; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 367; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 368; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2 369; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 370; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 371; GFX1030W32-NEXT: s_endpgm 372; 373; GFX1030W64-LABEL: vadd64ri: 374; GFX1030W64: ; %bb.0: ; %entry 375; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 376; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0 377; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 378; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3] 379; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 380; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 381; GFX1030W64-NEXT: s_endpgm 382; 383; GFX11-LABEL: vadd64ri: 384; GFX11: ; %bb.0: ; %entry 385; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 386; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 387; GFX11-NEXT: v_mov_b32_e32 v2, 0 388; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 389; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 390; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2 391; GFX11-NEXT: s_waitcnt lgkmcnt(0) 392; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 393; GFX11-NEXT: s_endpgm 394entry: 395 %tid = call i32 @llvm.amdgcn.workitem.id.x() 396 %tid.ext = sext i32 %tid to i64 397 %add = add i64 20015998343286, %tid.ext 398 store i64 %add, ptr addrspace(1) %out 399 ret void 400} 401 402; GCN-ISEL-LABEL: name: suaddo32 403; GCN-ISEL-LABEL: body: 404; GCN-ISEL-LABEL: bb.0 405; GCN-ISEL: S_ADD_I32 406define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { 407; CISI-LABEL: suaddo32: 408; CISI: ; %bb.0: 409; CISI-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0xd 410; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 411; CISI-NEXT: s_mov_b32 s3, 0xf000 412; CISI-NEXT: s_mov_b32 s2, -1 413; CISI-NEXT: s_waitcnt lgkmcnt(0) 414; CISI-NEXT: s_add_i32 s4, s6, s7 415; CISI-NEXT: v_mov_b32_e32 v0, s4 416; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 417; CISI-NEXT: s_endpgm 418; 419; VI-LABEL: suaddo32: 420; VI: ; %bb.0: 421; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 422; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 423; VI-NEXT: s_waitcnt lgkmcnt(0) 424; VI-NEXT: s_add_i32 s0, s0, s1 425; VI-NEXT: v_mov_b32_e32 v0, s2 426; VI-NEXT: v_mov_b32_e32 v1, s3 427; VI-NEXT: v_mov_b32_e32 v2, s0 428; VI-NEXT: flat_store_dword v[0:1], v2 429; VI-NEXT: s_endpgm 430; 431; GFX9-LABEL: suaddo32: 432; GFX9: ; %bb.0: 433; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 434; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 435; GFX9-NEXT: v_mov_b32_e32 v0, 0 436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 437; GFX9-NEXT: s_add_i32 s0, s0, s1 438; GFX9-NEXT: v_mov_b32_e32 v1, s0 439; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 440; GFX9-NEXT: s_endpgm 441; 442; GFX1010-LABEL: suaddo32: 443; GFX1010: ; %bb.0: 444; GFX1010-NEXT: s_clause 0x1 445; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 446; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 447; GFX1010-NEXT: v_mov_b32_e32 v0, 0 448; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 449; GFX1010-NEXT: s_add_i32 s0, s0, s1 450; GFX1010-NEXT: v_mov_b32_e32 v1, s0 451; GFX1010-NEXT: global_store_dword v0, v1, s[2:3] 452; GFX1010-NEXT: s_endpgm 453; 454; GFX1030W32-LABEL: suaddo32: 455; GFX1030W32: ; %bb.0: 456; GFX1030W32-NEXT: s_clause 0x1 457; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 458; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 459; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 460; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 461; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 462; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 463; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] 464; 
GFX1030W32-NEXT: s_endpgm 465; 466; GFX1030W64-LABEL: suaddo32: 467; GFX1030W64: ; %bb.0: 468; GFX1030W64-NEXT: s_clause 0x1 469; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 470; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 471; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 472; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 473; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 474; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 475; GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] 476; GFX1030W64-NEXT: s_endpgm 477; 478; GFX11-LABEL: suaddo32: 479; GFX11: ; %bb.0: 480; GFX11-NEXT: s_clause 0x1 481; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 482; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 483; GFX11-NEXT: s_waitcnt lgkmcnt(0) 484; GFX11-NEXT: s_add_i32 s0, s0, s1 485; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 486; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 487; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] 488; GFX11-NEXT: s_endpgm 489 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) 490 %val = extractvalue { i32, i1 } %uadd, 0 491 %carry = extractvalue { i32, i1 } %uadd, 1 492 store i32 %val, ptr addrspace(1) %out, align 4 493 ret void 494} 495 496 497; GCN-ISEL-LABEL: name: uaddo32_vcc_user 498; GCN-ISEL-LABEL: body: 499; GCN-ISEL-LABEL: bb.0 500; GCN-ISEL: V_ADD_CO_U32_e64 501 502; below we check selection to v_add/addc 503; because the only user of VCC produced by the UADDO is v_cndmask. 
504; We select to VALU form to avoid unnecessary s_cselect to copy SCC to VCC 505 506define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { 507; CISI-LABEL: uaddo32_vcc_user: 508; CISI: ; %bb.0: 509; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 510; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 511; CISI-NEXT: s_mov_b32 s7, 0xf000 512; CISI-NEXT: s_mov_b32 s6, -1 513; CISI-NEXT: s_waitcnt lgkmcnt(0) 514; CISI-NEXT: s_mov_b32 s4, s0 515; CISI-NEXT: v_mov_b32_e32 v0, s9 516; CISI-NEXT: s_mov_b32 s5, s1 517; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 518; CISI-NEXT: s_mov_b32 s0, s2 519; CISI-NEXT: s_mov_b32 s1, s3 520; CISI-NEXT: s_mov_b32 s2, s6 521; CISI-NEXT: s_mov_b32 s3, s7 522; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 523; CISI-NEXT: buffer_store_dword v0, off, s[4:7], 0 524; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 525; CISI-NEXT: s_endpgm 526; 527; VI-LABEL: uaddo32_vcc_user: 528; VI: ; %bb.0: 529; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 530; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 531; VI-NEXT: s_waitcnt lgkmcnt(0) 532; VI-NEXT: v_mov_b32_e32 v0, s0 533; VI-NEXT: v_mov_b32_e32 v4, s5 534; VI-NEXT: v_mov_b32_e32 v1, s1 535; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 536; VI-NEXT: v_mov_b32_e32 v2, s2 537; VI-NEXT: v_mov_b32_e32 v3, s3 538; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 539; VI-NEXT: flat_store_dword v[0:1], v4 540; VI-NEXT: flat_store_byte v[2:3], v5 541; VI-NEXT: s_endpgm 542; 543; GFX9-LABEL: uaddo32_vcc_user: 544; GFX9: ; %bb.0: 545; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 546; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 547; GFX9-NEXT: v_mov_b32_e32 v0, 0 548; GFX9-NEXT: s_waitcnt lgkmcnt(0) 549; GFX9-NEXT: v_mov_b32_e32 v1, s7 550; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s6, v1 551; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 552; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 553; GFX9-NEXT: global_store_byte v0, v2, s[2:3] 554; GFX9-NEXT: s_endpgm 555; 556; 
GFX1010-LABEL: uaddo32_vcc_user: 557; GFX1010: ; %bb.0: 558; GFX1010-NEXT: s_clause 0x1 559; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 560; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 561; GFX1010-NEXT: v_mov_b32_e32 v0, 0 562; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 563; GFX1010-NEXT: v_add_co_u32 v1, s4, s6, s7 564; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 565; GFX1010-NEXT: global_store_dword v0, v1, s[0:1] 566; GFX1010-NEXT: global_store_byte v0, v2, s[2:3] 567; GFX1010-NEXT: s_endpgm 568; 569; GFX1030W32-LABEL: uaddo32_vcc_user: 570; GFX1030W32: ; %bb.0: 571; GFX1030W32-NEXT: s_clause 0x1 572; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 573; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 574; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 575; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 576; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s6, s7 577; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 578; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] 579; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3] 580; GFX1030W32-NEXT: s_endpgm 581; 582; GFX1030W64-LABEL: uaddo32_vcc_user: 583; GFX1030W64: ; %bb.0: 584; GFX1030W64-NEXT: s_clause 0x1 585; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 586; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 587; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 588; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 589; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s6, s7 590; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] 591; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] 592; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3] 593; GFX1030W64-NEXT: s_endpgm 594; 595; GFX11-LABEL: uaddo32_vcc_user: 596; GFX11: ; %bb.0: 597; GFX11-NEXT: s_clause 0x1 598; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 599; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 600; GFX11-NEXT: v_mov_b32_e32 v0, 0 601; GFX11-NEXT: s_waitcnt lgkmcnt(0) 602; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7 603; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 604; GFX11-NEXT: 
v_cndmask_b32_e64 v2, 0, 1, s4 605; GFX11-NEXT: s_clause 0x1 606; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 607; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] 608; GFX11-NEXT: s_endpgm 609 %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) 610 %val = extractvalue { i32, i1 } %uadd, 0 611 %carry = extractvalue { i32, i1 } %uadd, 1 612 store i32 %val, ptr addrspace(1) %out, align 4 613 store i1 %carry, ptr addrspace(1) %carryout 614 ret void 615} 616 617; GCN-ISEL-LABEL: name: suaddo64 618; GCN-ISEL-LABEL: body: 619; GCN-ISEL-LABEL: bb.0 620; GCN-ISEL: S_ADD_U64_PSEUDO 621 622define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { 623; CISI-LABEL: suaddo64: 624; CISI: ; %bb.0: 625; CISI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 626; CISI-NEXT: s_mov_b32 s11, 0xf000 627; CISI-NEXT: s_mov_b32 s10, -1 628; CISI-NEXT: s_waitcnt lgkmcnt(0) 629; CISI-NEXT: s_add_u32 s6, s4, s6 630; CISI-NEXT: v_mov_b32_e32 v0, s4 631; CISI-NEXT: s_addc_u32 s7, s5, s7 632; CISI-NEXT: v_mov_b32_e32 v1, s5 633; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] 634; CISI-NEXT: v_mov_b32_e32 v2, s6 635; CISI-NEXT: s_mov_b32 s8, s0 636; CISI-NEXT: s_mov_b32 s9, s1 637; CISI-NEXT: s_mov_b32 s0, s2 638; CISI-NEXT: s_mov_b32 s1, s3 639; CISI-NEXT: s_mov_b32 s2, s10 640; CISI-NEXT: s_mov_b32 s3, s11 641; CISI-NEXT: v_mov_b32_e32 v3, s7 642; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 643; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 644; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 645; CISI-NEXT: s_endpgm 646; 647; VI-LABEL: suaddo64: 648; VI: ; %bb.0: 649; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 650; VI-NEXT: s_waitcnt lgkmcnt(0) 651; VI-NEXT: v_mov_b32_e32 v0, s0 652; VI-NEXT: s_add_u32 s0, s4, s6 653; VI-NEXT: v_mov_b32_e32 v4, s4 654; VI-NEXT: v_mov_b32_e32 v1, s1 655; VI-NEXT: s_addc_u32 s1, s5, s7 656; VI-NEXT: v_mov_b32_e32 v5, s5 657; VI-NEXT: v_mov_b32_e32 v7, s1 658; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], 
v[4:5] 659; VI-NEXT: v_mov_b32_e32 v6, s0 660; VI-NEXT: v_mov_b32_e32 v2, s2 661; VI-NEXT: v_mov_b32_e32 v3, s3 662; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7] 663; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 664; VI-NEXT: flat_store_byte v[2:3], v0 665; VI-NEXT: s_endpgm 666; 667; GFX9-LABEL: suaddo64: 668; GFX9: ; %bb.0: 669; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 670; GFX9-NEXT: v_mov_b32_e32 v4, 0 671; GFX9-NEXT: s_waitcnt lgkmcnt(0) 672; GFX9-NEXT: s_add_u32 s0, s12, s14 673; GFX9-NEXT: v_mov_b32_e32 v0, s12 674; GFX9-NEXT: v_mov_b32_e32 v1, s13 675; GFX9-NEXT: s_addc_u32 s1, s13, s15 676; GFX9-NEXT: v_mov_b32_e32 v3, s1 677; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] 678; GFX9-NEXT: v_mov_b32_e32 v2, s0 679; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 680; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] 681; GFX9-NEXT: global_store_byte v4, v0, s[10:11] 682; GFX9-NEXT: s_endpgm 683; 684; GFX1010-LABEL: suaddo64: 685; GFX1010: ; %bb.0: 686; GFX1010-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 687; GFX1010-NEXT: v_mov_b32_e32 v2, 0 688; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 689; GFX1010-NEXT: s_add_u32 s0, s12, s14 690; GFX1010-NEXT: s_addc_u32 s1, s13, s15 691; GFX1010-NEXT: v_mov_b32_e32 v0, s0 692; GFX1010-NEXT: v_mov_b32_e32 v1, s1 693; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13] 694; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 695; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 696; GFX1010-NEXT: global_store_byte v2, v3, s[10:11] 697; GFX1010-NEXT: s_endpgm 698; 699; GFX1030W32-LABEL: suaddo64: 700; GFX1030W32: ; %bb.0: 701; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 702; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 703; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 704; GFX1030W32-NEXT: s_add_u32 s6, s4, s6 705; GFX1030W32-NEXT: s_addc_u32 s7, s5, s7 706; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6 707; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] 708; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7 709; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 
0, 1, s4 710; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 711; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3] 712; GFX1030W32-NEXT: s_endpgm 713; 714; GFX1030W64-LABEL: suaddo64: 715; GFX1030W64: ; %bb.0: 716; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 717; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 718; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 719; GFX1030W64-NEXT: s_add_u32 s6, s4, s6 720; GFX1030W64-NEXT: s_addc_u32 s7, s5, s7 721; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 722; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5] 723; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7 724; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] 725; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 726; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3] 727; GFX1030W64-NEXT: s_endpgm 728; 729; GFX11-LABEL: suaddo64: 730; GFX11: ; %bb.0: 731; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 732; GFX11-NEXT: s_waitcnt lgkmcnt(0) 733; GFX11-NEXT: s_add_u32 s6, s4, s6 734; GFX11-NEXT: s_addc_u32 s7, s5, s7 735; GFX11-NEXT: v_mov_b32_e32 v0, s6 736; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] 737; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 738; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 739; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 740; GFX11-NEXT: s_clause 0x1 741; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 742; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] 743; GFX11-NEXT: s_endpgm 744 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) 745 %val = extractvalue { i64, i1 } %uadd, 0 746 %carry = extractvalue { i64, i1 } %uadd, 1 747 store i64 %val, ptr addrspace(1) %out, align 8 748 store i1 %carry, ptr addrspace(1) %carryout 749 ret void 750} 751 752; GCN-ISEL-LABEL: name: vuaddo64 753; GCN-ISEL-LABEL: body: 754; GCN-ISEL-LABEL: bb.0 755; GCN-ISEL: V_ADD_U64_PSEUDO 756 757define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { 758; CISI-LABEL: vuaddo64: 759; CISI: ; %bb.0: 760; 
CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 761; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 762; CISI-NEXT: s_mov_b32 s7, 0xf000 763; CISI-NEXT: s_mov_b32 s6, -1 764; CISI-NEXT: s_waitcnt lgkmcnt(0) 765; CISI-NEXT: s_mov_b32 s4, s0 766; CISI-NEXT: v_mov_b32_e32 v1, s9 767; CISI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 768; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 769; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[8:9], v[0:1] 770; CISI-NEXT: s_mov_b32 s5, s1 771; CISI-NEXT: s_mov_b32 s0, s2 772; CISI-NEXT: s_mov_b32 s1, s3 773; CISI-NEXT: s_mov_b32 s2, s6 774; CISI-NEXT: s_mov_b32 s3, s7 775; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 776; CISI-NEXT: s_waitcnt expcnt(0) 777; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 778; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 779; CISI-NEXT: s_endpgm 780; 781; VI-LABEL: vuaddo64: 782; VI: ; %bb.0: 783; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 784; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 785; VI-NEXT: s_waitcnt lgkmcnt(0) 786; VI-NEXT: v_mov_b32_e32 v1, s0 787; VI-NEXT: v_mov_b32_e32 v6, s5 788; VI-NEXT: v_add_u32_e32 v5, vcc, s4, v0 789; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 790; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6] 791; VI-NEXT: v_mov_b32_e32 v2, s1 792; VI-NEXT: v_mov_b32_e32 v3, s2 793; VI-NEXT: v_mov_b32_e32 v4, s3 794; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 795; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] 796; VI-NEXT: flat_store_byte v[3:4], v0 797; VI-NEXT: s_endpgm 798; 799; GFX9-LABEL: vuaddo64: 800; GFX9: ; %bb.0: 801; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 802; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 803; GFX9-NEXT: v_mov_b32_e32 v2, 0 804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 805; GFX9-NEXT: v_mov_b32_e32 v1, s7 806; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 807; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 808; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] 809; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 810; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 
811; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 812; GFX9-NEXT: s_endpgm 813; 814; GFX1010-LABEL: vuaddo64: 815; GFX1010: ; %bb.0: 816; GFX1010-NEXT: s_clause 0x1 817; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 818; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 819; GFX1010-NEXT: v_mov_b32_e32 v2, 0 820; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 821; GFX1010-NEXT: v_add_co_u32 v0, s4, s6, v0 822; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 823; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1] 824; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 825; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 826; GFX1010-NEXT: global_store_byte v2, v3, s[2:3] 827; GFX1010-NEXT: s_endpgm 828; 829; GFX1030W32-LABEL: vuaddo64: 830; GFX1030W32: ; %bb.0: 831; GFX1030W32-NEXT: s_clause 0x1 832; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 833; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 834; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 835; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 836; GFX1030W32-NEXT: v_add_co_u32 v0, s4, s6, v0 837; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 838; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1] 839; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 840; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 841; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3] 842; GFX1030W32-NEXT: s_endpgm 843; 844; GFX1030W64-LABEL: vuaddo64: 845; GFX1030W64: ; %bb.0: 846; GFX1030W64-NEXT: s_clause 0x1 847; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 848; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 849; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 850; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 851; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s6, v0 852; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[4:5] 853; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] 854; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc 855; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 856; 
GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3] 857; GFX1030W64-NEXT: s_endpgm 858; 859; GFX11-LABEL: vuaddo64: 860; GFX11: ; %bb.0: 861; GFX11-NEXT: s_clause 0x1 862; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 863; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 864; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 865; GFX11-NEXT: v_mov_b32_e32 v2, 0 866; GFX11-NEXT: s_waitcnt lgkmcnt(0) 867; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 868; GFX11-NEXT: v_add_co_u32 v0, s4, s6, v0 869; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s4 870; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 871; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[6:7], v[0:1] 872; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 873; GFX11-NEXT: s_clause 0x1 874; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 875; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] 876; GFX11-NEXT: s_endpgm 877 %tid = call i32 @llvm.amdgcn.workitem.id.x() 878 %tid.ext = sext i32 %tid to i64 879 %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %tid.ext) 880 %val = extractvalue { i64, i1 } %uadd, 0 881 %carry = extractvalue { i64, i1 } %uadd, 1 882 store i64 %val, ptr addrspace(1) %out, align 8 883 store i1 %carry, ptr addrspace(1) %carryout 884 ret void 885} 886 887; GCN-ISEL-LABEL: name: ssub64rr 888; GCN-ISEL-LABEL: body: 889; GCN-ISEL-LABEL: bb.0.entry: 890; GCN-ISEL: S_SUB_U64_PSEUDO 891 892define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { 893; CISI-LABEL: ssub64rr: 894; CISI: ; %bb.0: ; %entry 895; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 896; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 897; CISI-NEXT: s_mov_b32 s7, 0xf000 898; CISI-NEXT: s_mov_b32 s6, -1 899; CISI-NEXT: s_waitcnt lgkmcnt(0) 900; CISI-NEXT: s_mov_b32 s4, s0 901; CISI-NEXT: s_sub_u32 s0, s2, s8 902; CISI-NEXT: s_mov_b32 s5, s1 903; CISI-NEXT: s_subb_u32 s1, s3, s9 904; CISI-NEXT: v_mov_b32_e32 v0, s0 905; CISI-NEXT: v_mov_b32_e32 v1, s1 906; CISI-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 907; CISI-NEXT: s_endpgm 908; 909; VI-LABEL: ssub64rr: 910; VI: ; %bb.0: ; %entry 911; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 912; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 913; VI-NEXT: s_waitcnt lgkmcnt(0) 914; VI-NEXT: v_mov_b32_e32 v0, s0 915; VI-NEXT: s_sub_u32 s0, s2, s4 916; VI-NEXT: v_mov_b32_e32 v1, s1 917; VI-NEXT: s_subb_u32 s1, s3, s5 918; VI-NEXT: v_mov_b32_e32 v3, s1 919; VI-NEXT: v_mov_b32_e32 v2, s0 920; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 921; VI-NEXT: s_endpgm 922; 923; GFX9-LABEL: ssub64rr: 924; GFX9: ; %bb.0: ; %entry 925; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 926; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 927; GFX9-NEXT: v_mov_b32_e32 v2, 0 928; GFX9-NEXT: s_waitcnt lgkmcnt(0) 929; GFX9-NEXT: s_sub_u32 s2, s2, s6 930; GFX9-NEXT: s_subb_u32 s3, s3, s7 931; GFX9-NEXT: v_mov_b32_e32 v0, s2 932; GFX9-NEXT: v_mov_b32_e32 v1, s3 933; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 934; GFX9-NEXT: s_endpgm 935; 936; GFX1010-LABEL: ssub64rr: 937; GFX1010: ; %bb.0: ; %entry 938; GFX1010-NEXT: s_clause 0x1 939; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 940; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 941; GFX1010-NEXT: v_mov_b32_e32 v2, 0 942; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 943; GFX1010-NEXT: s_sub_u32 s2, s2, s6 944; GFX1010-NEXT: s_subb_u32 s3, s3, s7 945; GFX1010-NEXT: v_mov_b32_e32 v0, s2 946; GFX1010-NEXT: v_mov_b32_e32 v1, s3 947; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 948; GFX1010-NEXT: s_endpgm 949; 950; GFX1030W32-LABEL: ssub64rr: 951; GFX1030W32: ; %bb.0: ; %entry 952; GFX1030W32-NEXT: s_clause 0x1 953; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 954; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 955; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 956; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 957; GFX1030W32-NEXT: s_sub_u32 s2, s2, s4 958; GFX1030W32-NEXT: s_subb_u32 s3, s3, s5 959; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2 960; GFX1030W32-NEXT: 
v_mov_b32_e32 v1, s3 961; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 962; GFX1030W32-NEXT: s_endpgm 963; 964; GFX1030W64-LABEL: ssub64rr: 965; GFX1030W64: ; %bb.0: ; %entry 966; GFX1030W64-NEXT: s_clause 0x1 967; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 968; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 969; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 970; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 971; GFX1030W64-NEXT: s_sub_u32 s2, s2, s4 972; GFX1030W64-NEXT: s_subb_u32 s3, s3, s5 973; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2 974; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3 975; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 976; GFX1030W64-NEXT: s_endpgm 977; 978; GFX11-LABEL: ssub64rr: 979; GFX11: ; %bb.0: ; %entry 980; GFX11-NEXT: s_clause 0x1 981; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 982; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 983; GFX11-NEXT: s_waitcnt lgkmcnt(0) 984; GFX11-NEXT: s_sub_u32 s2, s2, s4 985; GFX11-NEXT: s_subb_u32 s3, s3, s5 986; GFX11-NEXT: v_mov_b32_e32 v0, s2 987; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 988; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 989; GFX11-NEXT: s_endpgm 990entry: 991 %sub = sub i64 %a, %b 992 store i64 %sub, ptr addrspace(1) %out 993 ret void 994} 995 996; GCN-ISEL-LABEL: name: ssub64ri 997; GCN-ISEL-LABEL: body: 998; GCN-ISEL-LABEL: bb.0.entry: 999; GCN-ISEL: S_SUB_U64_PSEUDO 1000 1001define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { 1002; CISI-LABEL: ssub64ri: 1003; CISI: ; %bb.0: ; %entry 1004; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1005; CISI-NEXT: s_mov_b32 s7, 0xf000 1006; CISI-NEXT: s_mov_b32 s6, -1 1007; CISI-NEXT: s_waitcnt lgkmcnt(0) 1008; CISI-NEXT: s_mov_b32 s4, s0 1009; CISI-NEXT: s_sub_u32 s0, 0x56789876, s2 1010; CISI-NEXT: s_mov_b32 s5, s1 1011; CISI-NEXT: s_subb_u32 s1, 0x1234, s3 1012; CISI-NEXT: v_mov_b32_e32 v0, s0 1013; CISI-NEXT: v_mov_b32_e32 v1, s1 1014; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 
1015; CISI-NEXT: s_endpgm 1016; 1017; VI-LABEL: ssub64ri: 1018; VI: ; %bb.0: ; %entry 1019; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1020; VI-NEXT: s_waitcnt lgkmcnt(0) 1021; VI-NEXT: v_mov_b32_e32 v0, s0 1022; VI-NEXT: s_sub_u32 s0, 0x56789876, s2 1023; VI-NEXT: v_mov_b32_e32 v1, s1 1024; VI-NEXT: s_subb_u32 s1, 0x1234, s3 1025; VI-NEXT: v_mov_b32_e32 v3, s1 1026; VI-NEXT: v_mov_b32_e32 v2, s0 1027; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 1028; VI-NEXT: s_endpgm 1029; 1030; GFX9-LABEL: ssub64ri: 1031; GFX9: ; %bb.0: ; %entry 1032; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1033; GFX9-NEXT: v_mov_b32_e32 v2, 0 1034; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2 1036; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3 1037; GFX9-NEXT: v_mov_b32_e32 v0, s2 1038; GFX9-NEXT: v_mov_b32_e32 v1, s3 1039; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1040; GFX9-NEXT: s_endpgm 1041; 1042; GFX1010-LABEL: ssub64ri: 1043; GFX1010: ; %bb.0: ; %entry 1044; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1045; GFX1010-NEXT: v_mov_b32_e32 v2, 0 1046; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1047; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2 1048; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3 1049; GFX1010-NEXT: v_mov_b32_e32 v0, s2 1050; GFX1010-NEXT: v_mov_b32_e32 v1, s3 1051; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1052; GFX1010-NEXT: s_endpgm 1053; 1054; GFX1030W32-LABEL: ssub64ri: 1055; GFX1030W32: ; %bb.0: ; %entry 1056; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1057; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 1058; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1059; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2 1060; GFX1030W32-NEXT: s_subb_u32 s3, 0x1234, s3 1061; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2 1062; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3 1063; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1064; GFX1030W32-NEXT: s_endpgm 1065; 1066; GFX1030W64-LABEL: ssub64ri: 1067; GFX1030W64: ; %bb.0: ; %entry 1068; GFX1030W64-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 1069; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 1070; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2 1072; GFX1030W64-NEXT: s_subb_u32 s3, 0x1234, s3 1073; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2 1074; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3 1075; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1076; GFX1030W64-NEXT: s_endpgm 1077; 1078; GFX11-LABEL: ssub64ri: 1079; GFX11: ; %bb.0: ; %entry 1080; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1081; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1082; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2 1083; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3 1084; GFX11-NEXT: v_mov_b32_e32 v0, s2 1085; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 1086; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1087; GFX11-NEXT: s_endpgm 1088entry: 1089 %sub = sub i64 20015998343286, %a 1090 store i64 %sub, ptr addrspace(1) %out 1091 ret void 1092} 1093 1094; GCN-ISEL-LABEL: name: vsub64rr 1095; GCN-ISEL-LABEL: body: 1096; GCN-ISEL-LABEL: bb.0.entry: 1097; GCN-ISEL: V_SUB_U64_PSEUDO 1098 1099define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { 1100; CISI-LABEL: vsub64rr: 1101; CISI: ; %bb.0: ; %entry 1102; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1103; CISI-NEXT: s_mov_b32 s7, 0xf000 1104; CISI-NEXT: s_mov_b32 s6, -1 1105; CISI-NEXT: s_waitcnt lgkmcnt(0) 1106; CISI-NEXT: v_mov_b32_e32 v1, s3 1107; CISI-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 1108; CISI-NEXT: s_mov_b32 s4, s0 1109; CISI-NEXT: s_mov_b32 s5, s1 1110; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc 1111; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1112; CISI-NEXT: s_endpgm 1113; 1114; VI-LABEL: vsub64rr: 1115; VI: ; %bb.0: ; %entry 1116; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1117; VI-NEXT: s_waitcnt lgkmcnt(0) 1118; VI-NEXT: v_mov_b32_e32 v4, s3 1119; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0 1120; VI-NEXT: v_mov_b32_e32 v1, s0 1121; VI-NEXT: v_mov_b32_e32 v2, s1 1122; VI-NEXT: 
v_subbrev_u32_e32 v4, vcc, 0, v4, vcc 1123; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4] 1124; VI-NEXT: s_endpgm 1125; 1126; GFX9-LABEL: vsub64rr: 1127; GFX9: ; %bb.0: ; %entry 1128; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1129; GFX9-NEXT: v_mov_b32_e32 v2, 0 1130; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1131; GFX9-NEXT: v_mov_b32_e32 v1, s3 1132; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 1133; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc 1134; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1135; GFX9-NEXT: s_endpgm 1136; 1137; GFX1010-LABEL: vsub64rr: 1138; GFX1010: ; %bb.0: ; %entry 1139; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1140; GFX1010-NEXT: v_mov_b32_e32 v2, 0 1141; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1142; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0 1143; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2 1144; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1145; GFX1010-NEXT: s_endpgm 1146; 1147; GFX1030W32-LABEL: vsub64rr: 1148; GFX1030W32: ; %bb.0: ; %entry 1149; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1150; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 1151; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1152; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0 1153; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 1154; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1155; GFX1030W32-NEXT: s_endpgm 1156; 1157; GFX1030W64-LABEL: vsub64rr: 1158; GFX1030W64: ; %bb.0: ; %entry 1159; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1160; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 1161; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1162; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0 1163; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[4:5] 1164; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1165; GFX1030W64-NEXT: s_endpgm 1166; 1167; GFX11-LABEL: vsub64rr: 1168; GFX11: ; %bb.0: ; %entry 1169; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1170; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1171; GFX11-NEXT: v_mov_b32_e32 
v2, 0 1172; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1173; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1174; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0 1175; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 1176; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1177; GFX11-NEXT: s_endpgm 1178entry: 1179 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1180 %tid.ext = sext i32 %tid to i64 1181 %sub = sub i64 %a, %tid.ext 1182 store i64 %sub, ptr addrspace(1) %out 1183 ret void 1184} 1185 1186; GCN-ISEL-LABEL: name: vsub64ri 1187; GCN-ISEL-LABEL: body: 1188; GCN-ISEL-LABEL: bb.0.entry: 1189; GCN-ISEL: V_SUB_U64_PSEUDO 1190 1191define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { 1192; CISI-LABEL: vsub64ri: 1193; CISI: ; %bb.0: ; %entry 1194; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1195; CISI-NEXT: v_sub_i32_e32 v0, vcc, 0x56789876, v0 1196; CISI-NEXT: v_mov_b32_e32 v1, 0x1234 1197; CISI-NEXT: s_mov_b32 s3, 0xf000 1198; CISI-NEXT: s_mov_b32 s2, -1 1199; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc 1200; CISI-NEXT: s_waitcnt lgkmcnt(0) 1201; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1202; CISI-NEXT: s_endpgm 1203; 1204; VI-LABEL: vsub64ri: 1205; VI: ; %bb.0: ; %entry 1206; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1207; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0 1208; VI-NEXT: v_mov_b32_e32 v1, 0x1234 1209; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc 1210; VI-NEXT: s_waitcnt lgkmcnt(0) 1211; VI-NEXT: v_mov_b32_e32 v3, s1 1212; VI-NEXT: v_mov_b32_e32 v2, s0 1213; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1214; VI-NEXT: s_endpgm 1215; 1216; GFX9-LABEL: vsub64ri: 1217; GFX9: ; %bb.0: ; %entry 1218; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1219; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0 1220; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234 1221; GFX9-NEXT: v_mov_b32_e32 v2, 0 1222; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc 1223; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1224; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] 1225; GFX9-NEXT: s_endpgm 1226; 1227; GFX1010-LABEL: vsub64ri: 1228; GFX1010: ; %bb.0: ; %entry 1229; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1230; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 1231; GFX1010-NEXT: v_mov_b32_e32 v2, 0 1232; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2 1233; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1234; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1235; GFX1010-NEXT: s_endpgm 1236; 1237; GFX1030W32-LABEL: vsub64ri: 1238; GFX1030W32: ; %bb.0: ; %entry 1239; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1240; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 1241; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 1242; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 1243; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1244; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1245; GFX1030W32-NEXT: s_endpgm 1246; 1247; GFX1030W64-LABEL: vsub64ri: 1248; GFX1030W64: ; %bb.0: ; %entry 1249; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1250; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0 1251; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 1252; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3] 1253; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1254; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1255; GFX1030W64-NEXT: s_endpgm 1256; 1257; GFX11-LABEL: vsub64ri: 1258; GFX11: ; %bb.0: ; %entry 1259; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1260; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1261; GFX11-NEXT: v_mov_b32_e32 v2, 0 1262; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1263; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 1264; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 1265; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1266; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1267; GFX11-NEXT: s_endpgm 1268entry: 1269 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1270 %tid.ext = sext i32 %tid to i64 
1271 %sub = sub i64 20015998343286, %tid.ext 1272 store i64 %sub, ptr addrspace(1) %out 1273 ret void 1274} 1275 1276; GCN-ISEL-LABEL: name: susubo32 1277; GCN-ISEL-LABEL: body: 1278; GCN-ISEL-LABEL: bb.0 1279; GCN-ISEL: S_SUB_I32 1280 1281define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { 1282; CISI-LABEL: susubo32: 1283; CISI: ; %bb.0: 1284; CISI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd 1285; CISI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1286; CISI-NEXT: s_mov_b32 s3, 0xf000 1287; CISI-NEXT: s_mov_b32 s2, -1 1288; CISI-NEXT: s_waitcnt lgkmcnt(0) 1289; CISI-NEXT: s_sub_i32 s4, s6, s7 1290; CISI-NEXT: v_mov_b32_e32 v0, s4 1291; CISI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1292; CISI-NEXT: s_endpgm 1293; 1294; VI-LABEL: susubo32: 1295; VI: ; %bb.0: 1296; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1297; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1298; VI-NEXT: s_waitcnt lgkmcnt(0) 1299; VI-NEXT: s_sub_i32 s0, s0, s1 1300; VI-NEXT: v_mov_b32_e32 v0, s2 1301; VI-NEXT: v_mov_b32_e32 v1, s3 1302; VI-NEXT: v_mov_b32_e32 v2, s0 1303; VI-NEXT: flat_store_dword v[0:1], v2 1304; VI-NEXT: s_endpgm 1305; 1306; GFX9-LABEL: susubo32: 1307; GFX9: ; %bb.0: 1308; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1309; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1310; GFX9-NEXT: v_mov_b32_e32 v0, 0 1311; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1312; GFX9-NEXT: s_sub_i32 s0, s0, s1 1313; GFX9-NEXT: v_mov_b32_e32 v1, s0 1314; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 1315; GFX9-NEXT: s_endpgm 1316; 1317; GFX1010-LABEL: susubo32: 1318; GFX1010: ; %bb.0: 1319; GFX1010-NEXT: s_clause 0x1 1320; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1321; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1322; GFX1010-NEXT: v_mov_b32_e32 v0, 0 1323; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1324; GFX1010-NEXT: s_sub_i32 s0, s0, s1 1325; GFX1010-NEXT: v_mov_b32_e32 v1, s0 1326; GFX1010-NEXT: global_store_dword v0, v1, s[2:3] 1327; GFX1010-NEXT: 
s_endpgm 1328; 1329; GFX1030W32-LABEL: susubo32: 1330; GFX1030W32: ; %bb.0: 1331; GFX1030W32-NEXT: s_clause 0x1 1332; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1333; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1334; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 1335; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX1030W32-NEXT: s_sub_i32 s0, s0, s1 1337; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0 1338; GFX1030W32-NEXT: global_store_dword v0, v1, s[2:3] 1339; GFX1030W32-NEXT: s_endpgm 1340; 1341; GFX1030W64-LABEL: susubo32: 1342; GFX1030W64: ; %bb.0: 1343; GFX1030W64-NEXT: s_clause 0x1 1344; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1345; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1346; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 1347; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1348; GFX1030W64-NEXT: s_sub_i32 s0, s0, s1 1349; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0 1350; GFX1030W64-NEXT: global_store_dword v0, v1, s[2:3] 1351; GFX1030W64-NEXT: s_endpgm 1352; 1353; GFX11-LABEL: susubo32: 1354; GFX11: ; %bb.0: 1355; GFX11-NEXT: s_clause 0x1 1356; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 1357; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 1358; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1359; GFX11-NEXT: s_sub_i32 s0, s0, s1 1360; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1361; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 1362; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] 1363; GFX11-NEXT: s_endpgm 1364 %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) 1365 %val = extractvalue { i32, i1 } %usub, 0 1366 %carry = extractvalue { i32, i1 } %usub, 1 1367 store i32 %val, ptr addrspace(1) %out, align 4 1368 ret void 1369} 1370 1371 1372; GCN-ISEL-LABEL: name: usubo32_vcc_user 1373; GCN-ISEL-LABEL: body: 1374; GCN-ISEL-LABEL: bb.0 1375; GCN-ISEL: V_SUB_CO_U32_e64 1376 1377; below we check selection to v_sub/subb 1378; because the only user of VCC produced by the USUBO is v_cndmask. 
1379; We select to VALU form to avoid unnecessary s_cselect to copy SCC to VCC 1380 1381define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 { 1382; CISI-LABEL: usubo32_vcc_user: 1383; CISI: ; %bb.0: 1384; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1385; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1386; CISI-NEXT: s_mov_b32 s7, 0xf000 1387; CISI-NEXT: s_mov_b32 s6, -1 1388; CISI-NEXT: s_waitcnt lgkmcnt(0) 1389; CISI-NEXT: s_mov_b32 s4, s0 1390; CISI-NEXT: v_mov_b32_e32 v0, s9 1391; CISI-NEXT: s_mov_b32 s5, s1 1392; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 1393; CISI-NEXT: s_mov_b32 s0, s2 1394; CISI-NEXT: s_mov_b32 s1, s3 1395; CISI-NEXT: s_mov_b32 s2, s6 1396; CISI-NEXT: s_mov_b32 s3, s7 1397; CISI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 1398; CISI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1399; CISI-NEXT: buffer_store_byte v1, off, s[0:3], 0 1400; CISI-NEXT: s_endpgm 1401; 1402; VI-LABEL: usubo32_vcc_user: 1403; VI: ; %bb.0: 1404; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1405; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1406; VI-NEXT: s_waitcnt lgkmcnt(0) 1407; VI-NEXT: v_mov_b32_e32 v0, s0 1408; VI-NEXT: v_mov_b32_e32 v4, s5 1409; VI-NEXT: v_mov_b32_e32 v1, s1 1410; VI-NEXT: v_sub_u32_e32 v4, vcc, s4, v4 1411; VI-NEXT: v_mov_b32_e32 v2, s2 1412; VI-NEXT: v_mov_b32_e32 v3, s3 1413; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc 1414; VI-NEXT: flat_store_dword v[0:1], v4 1415; VI-NEXT: flat_store_byte v[2:3], v5 1416; VI-NEXT: s_endpgm 1417; 1418; GFX9-LABEL: usubo32_vcc_user: 1419; GFX9: ; %bb.0: 1420; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1421; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1422; GFX9-NEXT: v_mov_b32_e32 v0, 0 1423; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1424; GFX9-NEXT: v_mov_b32_e32 v1, s7 1425; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 1426; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 1427; GFX9-NEXT: global_store_dword v0, v1, s[0:1] 1428; GFX9-NEXT: global_store_byte v0, 
v2, s[2:3] 1429; GFX9-NEXT: s_endpgm 1430; 1431; GFX1010-LABEL: usubo32_vcc_user: 1432; GFX1010: ; %bb.0: 1433; GFX1010-NEXT: s_clause 0x1 1434; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1435; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1436; GFX1010-NEXT: v_mov_b32_e32 v0, 0 1437; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1438; GFX1010-NEXT: v_sub_co_u32 v1, s4, s6, s7 1439; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 1440; GFX1010-NEXT: global_store_dword v0, v1, s[0:1] 1441; GFX1010-NEXT: global_store_byte v0, v2, s[2:3] 1442; GFX1010-NEXT: s_endpgm 1443; 1444; GFX1030W32-LABEL: usubo32_vcc_user: 1445; GFX1030W32: ; %bb.0: 1446; GFX1030W32-NEXT: s_clause 0x1 1447; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1448; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1449; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0 1450; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1451; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s6, s7 1452; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 1453; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1] 1454; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3] 1455; GFX1030W32-NEXT: s_endpgm 1456; 1457; GFX1030W64-LABEL: usubo32_vcc_user: 1458; GFX1030W64: ; %bb.0: 1459; GFX1030W64-NEXT: s_clause 0x1 1460; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1461; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1462; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0 1463; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1464; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s6, s7 1465; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] 1466; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1] 1467; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3] 1468; GFX1030W64-NEXT: s_endpgm 1469; 1470; GFX11-LABEL: usubo32_vcc_user: 1471; GFX11: ; %bb.0: 1472; GFX11-NEXT: s_clause 0x1 1473; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 1474; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1475; GFX11-NEXT: v_mov_b32_e32 v0, 0 1476; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1477; GFX11-NEXT: 
v_sub_co_u32 v1, s4, s6, s7 1478; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1479; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 1480; GFX11-NEXT: s_clause 0x1 1481; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1482; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] 1483; GFX11-NEXT: s_endpgm 1484 %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) 1485 %val = extractvalue { i32, i1 } %usub, 0 1486 %carry = extractvalue { i32, i1 } %usub, 1 1487 store i32 %val, ptr addrspace(1) %out, align 4 1488 store i1 %carry, ptr addrspace(1) %carryout 1489 ret void 1490} 1491 1492; GCN-ISEL-LABEL: name: susubo64 1493; GCN-ISEL-LABEL: body: 1494; GCN-ISEL-LABEL: bb.0 1495; GCN-ISEL: S_SUB_U64_PSEUDO 1496 1497define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 { 1498; CISI-LABEL: susubo64: 1499; CISI: ; %bb.0: 1500; CISI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 1501; CISI-NEXT: s_mov_b32 s11, 0xf000 1502; CISI-NEXT: s_mov_b32 s10, -1 1503; CISI-NEXT: s_waitcnt lgkmcnt(0) 1504; CISI-NEXT: s_sub_u32 s6, s4, s6 1505; CISI-NEXT: v_mov_b32_e32 v0, s4 1506; CISI-NEXT: s_subb_u32 s7, s5, s7 1507; CISI-NEXT: v_mov_b32_e32 v1, s5 1508; CISI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] 1509; CISI-NEXT: v_mov_b32_e32 v2, s6 1510; CISI-NEXT: s_mov_b32 s8, s0 1511; CISI-NEXT: s_mov_b32 s9, s1 1512; CISI-NEXT: s_mov_b32 s0, s2 1513; CISI-NEXT: s_mov_b32 s1, s3 1514; CISI-NEXT: s_mov_b32 s2, s10 1515; CISI-NEXT: s_mov_b32 s3, s11 1516; CISI-NEXT: v_mov_b32_e32 v3, s7 1517; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1518; CISI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 1519; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1520; CISI-NEXT: s_endpgm 1521; 1522; VI-LABEL: susubo64: 1523; VI: ; %bb.0: 1524; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1525; VI-NEXT: s_waitcnt lgkmcnt(0) 1526; VI-NEXT: v_mov_b32_e32 v0, s0 1527; VI-NEXT: s_sub_u32 s0, s4, s6 1528; VI-NEXT: v_mov_b32_e32 v4, s4 1529; VI-NEXT: v_mov_b32_e32 v1, s1 
1530; VI-NEXT: s_subb_u32 s1, s5, s7 1531; VI-NEXT: v_mov_b32_e32 v5, s5 1532; VI-NEXT: v_mov_b32_e32 v7, s1 1533; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5] 1534; VI-NEXT: v_mov_b32_e32 v6, s0 1535; VI-NEXT: v_mov_b32_e32 v2, s2 1536; VI-NEXT: v_mov_b32_e32 v3, s3 1537; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7] 1538; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1539; VI-NEXT: flat_store_byte v[2:3], v0 1540; VI-NEXT: s_endpgm 1541; 1542; GFX9-LABEL: susubo64: 1543; GFX9: ; %bb.0: 1544; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 1545; GFX9-NEXT: v_mov_b32_e32 v4, 0 1546; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1547; GFX9-NEXT: s_sub_u32 s0, s12, s14 1548; GFX9-NEXT: v_mov_b32_e32 v0, s12 1549; GFX9-NEXT: v_mov_b32_e32 v1, s13 1550; GFX9-NEXT: s_subb_u32 s1, s13, s15 1551; GFX9-NEXT: v_mov_b32_e32 v3, s1 1552; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] 1553; GFX9-NEXT: v_mov_b32_e32 v2, s0 1554; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1555; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] 1556; GFX9-NEXT: global_store_byte v4, v0, s[10:11] 1557; GFX9-NEXT: s_endpgm 1558; 1559; GFX1010-LABEL: susubo64: 1560; GFX1010: ; %bb.0: 1561; GFX1010-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 1562; GFX1010-NEXT: v_mov_b32_e32 v2, 0 1563; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1564; GFX1010-NEXT: s_sub_u32 s0, s12, s14 1565; GFX1010-NEXT: s_subb_u32 s1, s13, s15 1566; GFX1010-NEXT: v_mov_b32_e32 v0, s0 1567; GFX1010-NEXT: v_mov_b32_e32 v1, s1 1568; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13] 1569; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 1570; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 1571; GFX1010-NEXT: global_store_byte v2, v3, s[10:11] 1572; GFX1010-NEXT: s_endpgm 1573; 1574; GFX1030W32-LABEL: susubo64: 1575; GFX1030W32: ; %bb.0: 1576; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1577; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 1578; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1579; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6 1580; GFX1030W32-NEXT: s_subb_u32 
s7, s5, s7 1581; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6 1582; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] 1583; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7 1584; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 1585; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1586; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3] 1587; GFX1030W32-NEXT: s_endpgm 1588; 1589; GFX1030W64-LABEL: susubo64: 1590; GFX1030W64: ; %bb.0: 1591; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1592; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 1593; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1594; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6 1595; GFX1030W64-NEXT: s_subb_u32 s7, s5, s7 1596; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 1597; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5] 1598; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7 1599; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] 1600; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1601; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3] 1602; GFX1030W64-NEXT: s_endpgm 1603; 1604; GFX11-LABEL: susubo64: 1605; GFX11: ; %bb.0: 1606; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 1607; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1608; GFX11-NEXT: s_sub_u32 s6, s4, s6 1609; GFX11-NEXT: s_subb_u32 s7, s5, s7 1610; GFX11-NEXT: v_mov_b32_e32 v0, s6 1611; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] 1612; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 1613; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 1614; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 1615; GFX11-NEXT: s_clause 0x1 1616; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1617; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] 1618; GFX11-NEXT: s_endpgm 1619 %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) 1620 %val = extractvalue { i64, i1 } %usub, 0 1621 %carry = extractvalue { i64, i1 } %usub, 1 1622 store i64 %val, ptr addrspace(1) %out, align 8 1623 store i1 %carry, ptr addrspace(1) %carryout 1624 ret void 1625} 1626 1627; GCN-ISEL-LABEL: name: vusubo64 
1628; GCN-ISEL-LABEL: body: 1629; GCN-ISEL-LABEL: bb.0 1630; GCN-ISEL: V_SUB_U64_PSEUDO 1631 1632define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a) #0 { 1633; CISI-LABEL: vusubo64: 1634; CISI: ; %bb.0: 1635; CISI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1636; CISI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1637; CISI-NEXT: s_mov_b32 s7, 0xf000 1638; CISI-NEXT: s_mov_b32 s6, -1 1639; CISI-NEXT: s_waitcnt lgkmcnt(0) 1640; CISI-NEXT: s_mov_b32 s4, s0 1641; CISI-NEXT: v_mov_b32_e32 v1, s9 1642; CISI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 1643; CISI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc 1644; CISI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] 1645; CISI-NEXT: s_mov_b32 s5, s1 1646; CISI-NEXT: s_mov_b32 s0, s2 1647; CISI-NEXT: s_mov_b32 s1, s3 1648; CISI-NEXT: s_mov_b32 s2, s6 1649; CISI-NEXT: s_mov_b32 s3, s7 1650; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1651; CISI-NEXT: s_waitcnt expcnt(0) 1652; CISI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1653; CISI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1654; CISI-NEXT: s_endpgm 1655; 1656; VI-LABEL: vusubo64: 1657; VI: ; %bb.0: 1658; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1659; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1660; VI-NEXT: s_waitcnt lgkmcnt(0) 1661; VI-NEXT: v_mov_b32_e32 v1, s0 1662; VI-NEXT: v_mov_b32_e32 v6, s5 1663; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v0 1664; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc 1665; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[5:6] 1666; VI-NEXT: v_mov_b32_e32 v2, s1 1667; VI-NEXT: v_mov_b32_e32 v3, s2 1668; VI-NEXT: v_mov_b32_e32 v4, s3 1669; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1670; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] 1671; VI-NEXT: flat_store_byte v[3:4], v0 1672; VI-NEXT: s_endpgm 1673; 1674; GFX9-LABEL: vusubo64: 1675; GFX9: ; %bb.0: 1676; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1677; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1678; GFX9-NEXT: v_mov_b32_e32 v2, 0 1679; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 1680; GFX9-NEXT: v_mov_b32_e32 v1, s7 1681; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 1682; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc 1683; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] 1684; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1685; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1686; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 1687; GFX9-NEXT: s_endpgm 1688; 1689; GFX1010-LABEL: vusubo64: 1690; GFX1010: ; %bb.0: 1691; GFX1010-NEXT: s_clause 0x1 1692; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1693; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1694; GFX1010-NEXT: v_mov_b32_e32 v2, 0 1695; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 1696; GFX1010-NEXT: v_sub_co_u32 v0, s4, s6, v0 1697; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4 1698; GFX1010-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1] 1699; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 1700; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1701; GFX1010-NEXT: global_store_byte v2, v3, s[2:3] 1702; GFX1010-NEXT: s_endpgm 1703; 1704; GFX1030W32-LABEL: vusubo64: 1705; GFX1030W32: ; %bb.0: 1706; GFX1030W32-NEXT: s_clause 0x1 1707; GFX1030W32-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1708; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1709; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 1710; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 1711; GFX1030W32-NEXT: v_sub_co_u32 v0, s4, s6, v0 1712; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4 1713; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1] 1714; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 1715; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1716; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3] 1717; GFX1030W32-NEXT: s_endpgm 1718; 1719; GFX1030W64-LABEL: vusubo64: 1720; GFX1030W64: ; %bb.0: 1721; GFX1030W64-NEXT: s_clause 0x1 1722; GFX1030W64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1723; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1724; GFX1030W64-NEXT: v_mov_b32_e32 v2, 
0 1725; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 1726; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s6, v0 1727; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[4:5] 1728; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] 1729; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc 1730; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1731; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3] 1732; GFX1030W64-NEXT: s_endpgm 1733; 1734; GFX11-LABEL: vusubo64: 1735; GFX11: ; %bb.0: 1736; GFX11-NEXT: s_clause 0x1 1737; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 1738; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1739; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1740; GFX11-NEXT: v_mov_b32_e32 v2, 0 1741; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1742; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1743; GFX11-NEXT: v_sub_co_u32 v0, s4, s6, v0 1744; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s4 1745; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1746; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[6:7], v[0:1] 1747; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo 1748; GFX11-NEXT: s_clause 0x1 1749; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1750; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] 1751; GFX11-NEXT: s_endpgm 1752 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1753 %tid.ext = sext i32 %tid to i64 1754 %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %tid.ext) 1755 %val = extractvalue { i64, i1 } %usub, 0 1756 %carry = extractvalue { i64, i1 } %usub, 1 1757 store i64 %val, ptr addrspace(1) %out, align 8 1758 store i1 %carry, ptr addrspace(1) %carryout 1759 ret void 1760} 1761 1762; GCN-ISEL-LABEL: name: sudiv64 1763; GCN-ISEL-LABEL: body: 1764; GCN-ISEL-LABEL: bb.3 1765; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 1766; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]] 1767; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 1768; GCN-ISEL: S_SUB_CO_PSEUDO killed 
%{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]] 1769 1770define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { 1771; CISI-LABEL: sudiv64: 1772; CISI: ; %bb.0: 1773; CISI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1774; CISI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xd 1775; CISI-NEXT: s_waitcnt lgkmcnt(0) 1776; CISI-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] 1777; CISI-NEXT: s_mov_b32 s0, 0 1778; CISI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0 1779; CISI-NEXT: s_and_b64 vcc, exec, s[0:1] 1780; CISI-NEXT: s_cbranch_vccz .LBB16_4 1781; CISI-NEXT: ; %bb.1: 1782; CISI-NEXT: v_cvt_f32_u32_e32 v0, s2 1783; CISI-NEXT: v_cvt_f32_u32_e32 v1, s3 1784; CISI-NEXT: s_sub_u32 s0, 0, s2 1785; CISI-NEXT: s_subb_u32 s1, 0, s3 1786; CISI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 1787; CISI-NEXT: v_rcp_f32_e32 v0, v0 1788; CISI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 1789; CISI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 1790; CISI-NEXT: v_trunc_f32_e32 v1, v1 1791; CISI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 1792; CISI-NEXT: v_cvt_u32_f32_e32 v1, v1 1793; CISI-NEXT: v_cvt_u32_f32_e32 v0, v0 1794; CISI-NEXT: v_mul_lo_u32 v2, s0, v1 1795; CISI-NEXT: v_mul_hi_u32 v3, s0, v0 1796; CISI-NEXT: v_mul_lo_u32 v5, s1, v0 1797; CISI-NEXT: v_mul_lo_u32 v4, s0, v0 1798; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1799; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v5 1800; CISI-NEXT: v_mul_hi_u32 v3, v0, v4 1801; CISI-NEXT: v_mul_lo_u32 v5, v0, v2 1802; CISI-NEXT: v_mul_hi_u32 v7, v0, v2 1803; CISI-NEXT: v_mul_lo_u32 v6, v1, v4 1804; CISI-NEXT: v_mul_hi_u32 v4, v1, v4 1805; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v5 1806; CISI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 1807; CISI-NEXT: v_mul_hi_u32 v7, v1, v2 1808; CISI-NEXT: v_mul_lo_u32 v2, v1, v2 1809; CISI-NEXT: v_add_i32_e32 v3, vcc, v3, v6 1810; CISI-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 1811; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 1812; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1813; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 1814; CISI-NEXT: 
v_add_i32_e32 v0, vcc, v0, v2 1815; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1816; CISI-NEXT: v_mul_lo_u32 v2, s0, v1 1817; CISI-NEXT: v_mul_hi_u32 v3, s0, v0 1818; CISI-NEXT: v_mul_lo_u32 v4, s1, v0 1819; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1820; CISI-NEXT: v_mul_lo_u32 v3, s0, v0 1821; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1822; CISI-NEXT: v_mul_lo_u32 v6, v0, v2 1823; CISI-NEXT: v_mul_hi_u32 v7, v0, v3 1824; CISI-NEXT: v_mul_hi_u32 v8, v0, v2 1825; CISI-NEXT: v_mul_hi_u32 v5, v1, v3 1826; CISI-NEXT: v_mul_lo_u32 v3, v1, v3 1827; CISI-NEXT: v_mul_hi_u32 v4, v1, v2 1828; CISI-NEXT: v_add_i32_e32 v6, vcc, v7, v6 1829; CISI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 1830; CISI-NEXT: v_mul_lo_u32 v2, v1, v2 1831; CISI-NEXT: v_add_i32_e32 v3, vcc, v6, v3 1832; CISI-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 1833; CISI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 1834; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1835; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 1836; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1837; CISI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 1838; CISI-NEXT: v_mul_lo_u32 v2, s10, v1 1839; CISI-NEXT: v_mul_hi_u32 v3, s10, v0 1840; CISI-NEXT: v_mul_hi_u32 v4, s10, v1 1841; CISI-NEXT: v_mul_hi_u32 v5, s11, v1 1842; CISI-NEXT: v_mul_lo_u32 v1, s11, v1 1843; CISI-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1844; CISI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 1845; CISI-NEXT: v_mul_lo_u32 v4, s11, v0 1846; CISI-NEXT: v_mul_hi_u32 v0, s11, v0 1847; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1848; CISI-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 1849; CISI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 1850; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1851; CISI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 1852; CISI-NEXT: v_mul_lo_u32 v2, s2, v1 1853; CISI-NEXT: v_mul_hi_u32 v3, s2, v0 1854; CISI-NEXT: v_mul_lo_u32 v4, s3, v0 1855; CISI-NEXT: v_mov_b32_e32 v5, s3 1856; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1857; CISI-NEXT: v_mul_lo_u32 v3, s2, v0 1858; CISI-NEXT: 
v_add_i32_e32 v2, vcc, v4, v2 1859; CISI-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 1860; CISI-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 1861; CISI-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 1862; CISI-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 1863; CISI-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 1864; CISI-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 1865; CISI-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 1866; CISI-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 1867; CISI-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 1868; CISI-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 1869; CISI-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 1870; CISI-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 1871; CISI-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 1872; CISI-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 1873; CISI-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 1874; CISI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 1875; CISI-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] 1876; CISI-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] 1877; CISI-NEXT: v_mov_b32_e32 v6, s11 1878; CISI-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 1879; CISI-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 1880; CISI-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 1881; CISI-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 1882; CISI-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 1883; CISI-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 1884; CISI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 1885; CISI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 1886; CISI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1887; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1888; CISI-NEXT: s_cbranch_execnz .LBB16_3 1889; CISI-NEXT: .LBB16_2: 1890; CISI-NEXT: v_cvt_f32_u32_e32 v0, s2 1891; CISI-NEXT: s_sub_i32 s0, 0, s2 1892; CISI-NEXT: v_rcp_iflag_f32_e32 v0, v0 1893; CISI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1894; CISI-NEXT: v_cvt_u32_f32_e32 v0, v0 1895; CISI-NEXT: v_mul_lo_u32 v1, s0, v0 1896; CISI-NEXT: v_mul_hi_u32 v1, v0, v1 1897; CISI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 1898; CISI-NEXT: v_mul_hi_u32 v0, s10, v0 1899; CISI-NEXT: v_readfirstlane_b32 s0, 
v0 1900; CISI-NEXT: s_mul_i32 s0, s0, s2 1901; CISI-NEXT: s_sub_i32 s0, s10, s0 1902; CISI-NEXT: s_sub_i32 s1, s0, s2 1903; CISI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 1904; CISI-NEXT: s_cmp_ge_u32 s0, s2 1905; CISI-NEXT: s_cselect_b64 vcc, -1, 0 1906; CISI-NEXT: s_cselect_b32 s0, s1, s0 1907; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 1908; CISI-NEXT: v_add_i32_e32 v1, vcc, 1, v0 1909; CISI-NEXT: s_cmp_ge_u32 s0, s2 1910; CISI-NEXT: s_cselect_b64 vcc, -1, 0 1911; CISI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 1912; CISI-NEXT: v_mov_b32_e32 v1, 0 1913; CISI-NEXT: .LBB16_3: 1914; CISI-NEXT: s_mov_b32 s11, 0xf000 1915; CISI-NEXT: s_mov_b32 s10, -1 1916; CISI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 1917; CISI-NEXT: s_endpgm 1918; CISI-NEXT: .LBB16_4: 1919; CISI-NEXT: ; implicit-def: $vgpr0_vgpr1 1920; CISI-NEXT: s_branch .LBB16_2 1921; 1922; VI-LABEL: sudiv64: 1923; VI: ; %bb.0: 1924; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1925; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 1926; VI-NEXT: s_waitcnt lgkmcnt(0) 1927; VI-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] 1928; VI-NEXT: s_mov_b32 s0, 0 1929; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 1930; VI-NEXT: s_cbranch_scc0 .LBB16_4 1931; VI-NEXT: ; %bb.1: 1932; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 1933; VI-NEXT: v_cvt_f32_u32_e32 v1, s3 1934; VI-NEXT: s_sub_u32 s4, 0, s2 1935; VI-NEXT: s_subb_u32 s5, 0, s3 1936; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 1937; VI-NEXT: v_rcp_f32_e32 v0, v0 1938; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 1939; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 1940; VI-NEXT: v_trunc_f32_e32 v1, v1 1941; VI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 1942; VI-NEXT: v_cvt_u32_f32_e32 v4, v1 1943; VI-NEXT: v_cvt_u32_f32_e32 v5, v0 1944; VI-NEXT: v_mul_lo_u32 v2, s4, v4 1945; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v5, 0 1946; VI-NEXT: v_mul_lo_u32 v3, s5, v5 1947; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 1948; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 1949; VI-NEXT: v_mul_hi_u32 v6, v5, v0 1950; VI-NEXT: 
v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0 1951; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 1952; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v0, 0 1953; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc 1954; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v3, 0 1955; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0 1956; VI-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc 1957; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1958; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 1959; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1960; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v0 1961; VI-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc 1962; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v6, 0 1963; VI-NEXT: v_mul_lo_u32 v4, s4, v7 1964; VI-NEXT: v_mul_lo_u32 v5, s5, v6 1965; VI-NEXT: v_mul_hi_u32 v8, v6, v0 1966; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0 1967; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 1968; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5 1969; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0 1970; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0 1971; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4 1972; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1973; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 1974; VI-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc 1975; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1976; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 1977; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1978; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v0 1979; VI-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc 1980; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 1981; VI-NEXT: v_mul_hi_u32 v4, s10, v2 1982; VI-NEXT: v_readfirstlane_b32 s4, v1 1983; VI-NEXT: v_readfirstlane_b32 s5, v0 1984; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s11, v3, 0 1985; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s11, v2, 0 1986; VI-NEXT: v_readfirstlane_b32 s6, v4 1987; VI-NEXT: s_add_u32 s0, s6, s5 1988; VI-NEXT: s_addc_u32 s1, 0, s4 1989; VI-NEXT: v_readfirstlane_b32 s6, v2 1990; VI-NEXT: v_readfirstlane_b32 s5, v3 1991; VI-NEXT: s_add_u32 s0, s0, s6 1992; VI-NEXT: 
v_readfirstlane_b32 s4, v1 1993; VI-NEXT: s_addc_u32 s0, s1, s5 1994; VI-NEXT: s_addc_u32 s6, s4, 0 1995; VI-NEXT: v_readfirstlane_b32 s1, v0 1996; VI-NEXT: s_add_u32 s7, s0, s1 1997; VI-NEXT: v_mov_b32_e32 v2, s7 1998; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0 1999; VI-NEXT: s_addc_u32 s6, 0, s6 2000; VI-NEXT: s_mul_i32 s0, s2, s6 2001; VI-NEXT: v_readfirstlane_b32 s1, v1 2002; VI-NEXT: s_add_i32 s0, s1, s0 2003; VI-NEXT: s_mul_i32 s1, s3, s7 2004; VI-NEXT: s_add_i32 s12, s0, s1 2005; VI-NEXT: s_sub_i32 s0, s11, s12 2006; VI-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 2007; VI-NEXT: s_cmp_lg_u64 vcc, 0 2008; VI-NEXT: s_subb_u32 s13, s0, s3 2009; VI-NEXT: v_subrev_u32_e64 v1, s[0:1], s2, v0 2010; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 2011; VI-NEXT: s_subb_u32 s13, s13, 0 2012; VI-NEXT: s_cmp_ge_u32 s13, s3 2013; VI-NEXT: s_cselect_b32 s14, -1, 0 2014; VI-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 2015; VI-NEXT: s_cmp_eq_u32 s13, s3 2016; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] 2017; VI-NEXT: v_mov_b32_e32 v3, s14 2018; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 2019; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] 2020; VI-NEXT: s_add_u32 s0, s7, 1 2021; VI-NEXT: s_addc_u32 s13, s6, 0 2022; VI-NEXT: s_add_u32 s1, s7, 2 2023; VI-NEXT: s_addc_u32 s7, s6, 0 2024; VI-NEXT: v_mov_b32_e32 v3, s0 2025; VI-NEXT: v_mov_b32_e32 v4, s1 2026; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 2027; VI-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 2028; VI-NEXT: v_mov_b32_e32 v1, s13 2029; VI-NEXT: v_mov_b32_e32 v4, s7 2030; VI-NEXT: s_cmp_lg_u64 vcc, 0 2031; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 2032; VI-NEXT: s_subb_u32 s0, s11, s12 2033; VI-NEXT: s_cmp_ge_u32 s0, s3 2034; VI-NEXT: s_cselect_b32 s1, -1, 0 2035; VI-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2036; VI-NEXT: s_cmp_eq_u32 s0, s3 2037; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2038; VI-NEXT: v_mov_b32_e32 v4, s1 2039; VI-NEXT: s_cselect_b64 vcc, -1, 0 2040; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 2041; VI-NEXT: v_mov_b32_e32 v4, s6 2042; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 2043; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc 2044; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc 2045; VI-NEXT: s_cbranch_execnz .LBB16_3 2046; VI-NEXT: .LBB16_2: 2047; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 2048; VI-NEXT: s_sub_i32 s0, 0, s2 2049; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 2050; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2051; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 2052; VI-NEXT: v_mul_lo_u32 v1, s0, v0 2053; VI-NEXT: v_mul_hi_u32 v1, v0, v1 2054; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 2055; VI-NEXT: v_mul_hi_u32 v0, s10, v0 2056; VI-NEXT: v_readfirstlane_b32 s0, v0 2057; VI-NEXT: s_mul_i32 s0, s0, s2 2058; VI-NEXT: s_sub_i32 s0, s10, s0 2059; VI-NEXT: s_sub_i32 s1, s0, s2 2060; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 2061; VI-NEXT: s_cmp_ge_u32 s0, s2 2062; VI-NEXT: s_cselect_b64 vcc, -1, 0 2063; VI-NEXT: s_cselect_b32 s0, s1, s0 2064; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2065; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 2066; VI-NEXT: s_cmp_ge_u32 s0, s2 2067; VI-NEXT: s_cselect_b64 vcc, -1, 0 2068; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2069; VI-NEXT: v_mov_b32_e32 v1, 0 2070; VI-NEXT: .LBB16_3: 2071; VI-NEXT: v_mov_b32_e32 v2, s8 2072; VI-NEXT: v_mov_b32_e32 v3, s9 2073; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 2074; VI-NEXT: s_endpgm 2075; VI-NEXT: .LBB16_4: 2076; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 2077; VI-NEXT: s_branch .LBB16_2 2078; 2079; GFX9-LABEL: sudiv64: 2080; GFX9: ; %bb.0: 2081; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2082; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 2083; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2084; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] 2085; GFX9-NEXT: s_mov_b32 s0, 0 2086; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 2087; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 2088; GFX9-NEXT: ; %bb.1: 2089; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2090; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2091; GFX9-NEXT: s_sub_u32 s0, 0, s2 2092; GFX9-NEXT: s_subb_u32 s1, 0, s3 2093; GFX9-NEXT: v_madmk_f32 v0, v1, 
0x4f800000, v0 2094; GFX9-NEXT: v_rcp_f32_e32 v0, v0 2095; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 2096; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 2097; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2098; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 2099; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2100; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2101; GFX9-NEXT: v_readfirstlane_b32 s6, v1 2102; GFX9-NEXT: v_readfirstlane_b32 s7, v0 2103; GFX9-NEXT: s_mul_i32 s12, s0, s6 2104; GFX9-NEXT: s_mul_hi_u32 s14, s0, s7 2105; GFX9-NEXT: s_mul_i32 s13, s1, s7 2106; GFX9-NEXT: s_add_i32 s12, s14, s12 2107; GFX9-NEXT: s_add_i32 s12, s12, s13 2108; GFX9-NEXT: s_mul_i32 s15, s0, s7 2109; GFX9-NEXT: s_mul_hi_u32 s13, s7, s12 2110; GFX9-NEXT: s_mul_i32 s14, s7, s12 2111; GFX9-NEXT: s_mul_hi_u32 s7, s7, s15 2112; GFX9-NEXT: s_add_u32 s7, s7, s14 2113; GFX9-NEXT: s_addc_u32 s13, 0, s13 2114; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 2115; GFX9-NEXT: s_mul_i32 s15, s6, s15 2116; GFX9-NEXT: s_add_u32 s7, s7, s15 2117; GFX9-NEXT: s_mul_hi_u32 s14, s6, s12 2118; GFX9-NEXT: s_addc_u32 s7, s13, s16 2119; GFX9-NEXT: s_addc_u32 s13, s14, 0 2120; GFX9-NEXT: s_mul_i32 s12, s6, s12 2121; GFX9-NEXT: s_add_u32 s7, s7, s12 2122; GFX9-NEXT: s_addc_u32 s12, 0, s13 2123; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s7, v0 2124; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 2125; GFX9-NEXT: s_addc_u32 s6, s6, s12 2126; GFX9-NEXT: v_readfirstlane_b32 s12, v0 2127; GFX9-NEXT: s_mul_i32 s7, s0, s6 2128; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 2129; GFX9-NEXT: s_add_i32 s7, s13, s7 2130; GFX9-NEXT: s_mul_i32 s1, s1, s12 2131; GFX9-NEXT: s_add_i32 s7, s7, s1 2132; GFX9-NEXT: s_mul_i32 s0, s0, s12 2133; GFX9-NEXT: s_mul_hi_u32 s13, s6, s0 2134; GFX9-NEXT: s_mul_i32 s14, s6, s0 2135; GFX9-NEXT: s_mul_i32 s16, s12, s7 2136; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 2137; GFX9-NEXT: s_mul_hi_u32 s15, s12, s7 2138; GFX9-NEXT: s_add_u32 s0, s0, s16 2139; GFX9-NEXT: s_addc_u32 s12, 0, s15 2140; GFX9-NEXT: s_add_u32 s0, s0, s14 2141; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 
2142; GFX9-NEXT: s_addc_u32 s0, s12, s13 2143; GFX9-NEXT: s_addc_u32 s1, s1, 0 2144; GFX9-NEXT: s_mul_i32 s7, s6, s7 2145; GFX9-NEXT: s_add_u32 s0, s0, s7 2146; GFX9-NEXT: s_addc_u32 s1, 0, s1 2147; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 2148; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 2149; GFX9-NEXT: s_addc_u32 s0, s6, s1 2150; GFX9-NEXT: v_readfirstlane_b32 s7, v0 2151; GFX9-NEXT: s_mul_i32 s6, s10, s0 2152; GFX9-NEXT: s_mul_hi_u32 s12, s10, s7 2153; GFX9-NEXT: s_mul_hi_u32 s1, s10, s0 2154; GFX9-NEXT: s_add_u32 s6, s12, s6 2155; GFX9-NEXT: s_addc_u32 s1, 0, s1 2156; GFX9-NEXT: s_mul_hi_u32 s13, s11, s7 2157; GFX9-NEXT: s_mul_i32 s7, s11, s7 2158; GFX9-NEXT: s_add_u32 s6, s6, s7 2159; GFX9-NEXT: s_mul_hi_u32 s12, s11, s0 2160; GFX9-NEXT: s_addc_u32 s1, s1, s13 2161; GFX9-NEXT: s_addc_u32 s6, s12, 0 2162; GFX9-NEXT: s_mul_i32 s0, s11, s0 2163; GFX9-NEXT: s_add_u32 s7, s1, s0 2164; GFX9-NEXT: s_addc_u32 s6, 0, s6 2165; GFX9-NEXT: s_mul_i32 s0, s2, s6 2166; GFX9-NEXT: s_mul_hi_u32 s1, s2, s7 2167; GFX9-NEXT: s_add_i32 s0, s1, s0 2168; GFX9-NEXT: s_mul_i32 s1, s3, s7 2169; GFX9-NEXT: s_add_i32 s12, s0, s1 2170; GFX9-NEXT: s_mul_i32 s1, s2, s7 2171; GFX9-NEXT: v_mov_b32_e32 v0, s1 2172; GFX9-NEXT: s_sub_i32 s0, s11, s12 2173; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 2174; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 2175; GFX9-NEXT: s_subb_u32 s13, s0, s3 2176; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0 2177; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 2178; GFX9-NEXT: s_subb_u32 s13, s13, 0 2179; GFX9-NEXT: s_cmp_ge_u32 s13, s3 2180; GFX9-NEXT: s_cselect_b32 s14, -1, 0 2181; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 2182; GFX9-NEXT: s_cmp_eq_u32 s13, s3 2183; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] 2184; GFX9-NEXT: v_mov_b32_e32 v2, s14 2185; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 2186; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] 2187; GFX9-NEXT: s_add_u32 s0, s7, 1 2188; GFX9-NEXT: s_addc_u32 s13, s6, 0 2189; GFX9-NEXT: s_add_u32 s1, s7, 2 2190; GFX9-NEXT: s_addc_u32 s14, 
s6, 0 2191; GFX9-NEXT: v_mov_b32_e32 v2, s0 2192; GFX9-NEXT: v_mov_b32_e32 v3, s1 2193; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 2194; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 2195; GFX9-NEXT: v_mov_b32_e32 v1, s13 2196; GFX9-NEXT: v_mov_b32_e32 v3, s14 2197; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 2198; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 2199; GFX9-NEXT: s_subb_u32 s0, s11, s12 2200; GFX9-NEXT: s_cmp_ge_u32 s0, s3 2201; GFX9-NEXT: s_cselect_b32 s1, -1, 0 2202; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2203; GFX9-NEXT: s_cmp_eq_u32 s0, s3 2204; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2205; GFX9-NEXT: v_mov_b32_e32 v3, s1 2206; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 2207; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 2208; GFX9-NEXT: v_mov_b32_e32 v3, s6 2209; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 2210; GFX9-NEXT: v_mov_b32_e32 v0, s7 2211; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 2212; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2213; GFX9-NEXT: s_cbranch_execnz .LBB16_3 2214; GFX9-NEXT: .LBB16_2: 2215; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2216; GFX9-NEXT: s_sub_i32 s0, 0, s2 2217; GFX9-NEXT: s_mov_b32 s1, 0 2218; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2219; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2220; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2221; GFX9-NEXT: v_readfirstlane_b32 s3, v0 2222; GFX9-NEXT: s_mul_i32 s0, s0, s3 2223; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0 2224; GFX9-NEXT: s_add_i32 s3, s3, s0 2225; GFX9-NEXT: s_mul_hi_u32 s0, s10, s3 2226; GFX9-NEXT: s_mul_i32 s4, s0, s2 2227; GFX9-NEXT: s_sub_i32 s4, s10, s4 2228; GFX9-NEXT: s_add_i32 s3, s0, 1 2229; GFX9-NEXT: s_sub_i32 s5, s4, s2 2230; GFX9-NEXT: s_cmp_ge_u32 s4, s2 2231; GFX9-NEXT: s_cselect_b32 s0, s3, s0 2232; GFX9-NEXT: s_cselect_b32 s4, s5, s4 2233; GFX9-NEXT: s_add_i32 s3, s0, 1 2234; GFX9-NEXT: s_cmp_ge_u32 s4, s2 2235; GFX9-NEXT: s_cselect_b32 s0, s3, s0 2236; GFX9-NEXT: v_mov_b32_e32 v0, s0 2237; GFX9-NEXT: v_mov_b32_e32 v1, s1 2238; GFX9-NEXT: .LBB16_3: 2239; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 2240; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 2241; GFX9-NEXT: s_endpgm 2242; GFX9-NEXT: .LBB16_4: 2243; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 2244; GFX9-NEXT: s_branch .LBB16_2 2245; 2246; GFX1010-LABEL: sudiv64: 2247; GFX1010: ; %bb.0: 2248; GFX1010-NEXT: s_clause 0x1 2249; GFX1010-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2250; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 2251; GFX1010-NEXT: s_waitcnt lgkmcnt(0) 2252; GFX1010-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] 2253; GFX1010-NEXT: s_mov_b32 s4, 0 2254; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0 2255; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 2256; GFX1010-NEXT: ; %bb.1: 2257; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 2258; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3 2259; GFX1010-NEXT: s_sub_u32 s5, 0, s2 2260; GFX1010-NEXT: s_subb_u32 s6, 0, s3 2261; GFX1010-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 2262; GFX1010-NEXT: v_rcp_f32_e32 v0, v0 2263; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 2264; GFX1010-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 2265; GFX1010-NEXT: v_trunc_f32_e32 v1, v1 2266; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 2267; GFX1010-NEXT: v_cvt_u32_f32_e32 v1, v1 2268; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 2269; GFX1010-NEXT: v_readfirstlane_b32 s0, v1 2270; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 2271; GFX1010-NEXT: s_mul_i32 s7, s5, s0 2272; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s1 2273; GFX1010-NEXT: s_mul_i32 s12, s6, s1 2274; GFX1010-NEXT: s_add_i32 s7, s13, s7 2275; GFX1010-NEXT: s_mul_i32 s14, s5, s1 2276; GFX1010-NEXT: s_add_i32 s7, s7, s12 2277; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14 2278; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14 2279; GFX1010-NEXT: s_mul_i32 s12, s0, s14 2280; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s7 2281; GFX1010-NEXT: s_mul_i32 s1, s1, s7 2282; GFX1010-NEXT: s_mul_hi_u32 s16, s0, s7 2283; GFX1010-NEXT: s_add_u32 s1, s13, s1 2284; GFX1010-NEXT: s_addc_u32 s13, 0, s14 2285; GFX1010-NEXT: s_add_u32 s1, s1, s12 2286; GFX1010-NEXT: s_mul_i32 s7, 
s0, s7 2287; GFX1010-NEXT: s_addc_u32 s1, s13, s15 2288; GFX1010-NEXT: s_addc_u32 s12, s16, 0 2289; GFX1010-NEXT: s_add_u32 s1, s1, s7 2290; GFX1010-NEXT: s_addc_u32 s7, 0, s12 2291; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 2292; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 2293; GFX1010-NEXT: s_addc_u32 s0, s0, s7 2294; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 2295; GFX1010-NEXT: s_mul_i32 s7, s5, s0 2296; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s1 2297; GFX1010-NEXT: s_mul_i32 s6, s6, s1 2298; GFX1010-NEXT: s_add_i32 s7, s12, s7 2299; GFX1010-NEXT: s_mul_i32 s5, s5, s1 2300; GFX1010-NEXT: s_add_i32 s7, s7, s6 2301; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s5 2302; GFX1010-NEXT: s_mul_i32 s13, s0, s5 2303; GFX1010-NEXT: s_mul_hi_u32 s5, s1, s5 2304; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s7 2305; GFX1010-NEXT: s_mul_i32 s1, s1, s7 2306; GFX1010-NEXT: s_mul_hi_u32 s6, s0, s7 2307; GFX1010-NEXT: s_add_u32 s1, s5, s1 2308; GFX1010-NEXT: s_addc_u32 s5, 0, s14 2309; GFX1010-NEXT: s_add_u32 s1, s1, s13 2310; GFX1010-NEXT: s_mul_i32 s7, s0, s7 2311; GFX1010-NEXT: s_addc_u32 s1, s5, s12 2312; GFX1010-NEXT: s_addc_u32 s5, s6, 0 2313; GFX1010-NEXT: s_add_u32 s1, s1, s7 2314; GFX1010-NEXT: s_addc_u32 s5, 0, s5 2315; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 2316; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 2317; GFX1010-NEXT: s_addc_u32 s0, s0, s5 2318; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 2319; GFX1010-NEXT: s_mul_i32 s6, s10, s0 2320; GFX1010-NEXT: s_mul_hi_u32 s5, s10, s0 2321; GFX1010-NEXT: s_mul_hi_u32 s7, s11, s0 2322; GFX1010-NEXT: s_mul_i32 s0, s11, s0 2323; GFX1010-NEXT: s_mul_hi_u32 s12, s10, s1 2324; GFX1010-NEXT: s_mul_hi_u32 s13, s11, s1 2325; GFX1010-NEXT: s_mul_i32 s1, s11, s1 2326; GFX1010-NEXT: s_add_u32 s6, s12, s6 2327; GFX1010-NEXT: s_addc_u32 s5, 0, s5 2328; GFX1010-NEXT: s_add_u32 s1, s6, s1 2329; GFX1010-NEXT: s_addc_u32 s1, s5, s13 2330; GFX1010-NEXT: s_addc_u32 s5, s7, 0 2331; GFX1010-NEXT: s_add_u32 s1, s1, s0 2332; GFX1010-NEXT: s_addc_u32 s5, 0, s5 2333; GFX1010-NEXT: 
s_mul_hi_u32 s0, s2, s1 2334; GFX1010-NEXT: s_mul_i32 s7, s2, s5 2335; GFX1010-NEXT: s_mul_i32 s12, s2, s1 2336; GFX1010-NEXT: s_add_i32 s0, s0, s7 2337; GFX1010-NEXT: v_sub_co_u32 v0, s7, s10, s12 2338; GFX1010-NEXT: s_mul_i32 s6, s3, s1 2339; GFX1010-NEXT: s_add_i32 s0, s0, s6 2340; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 2341; GFX1010-NEXT: s_sub_i32 s6, s11, s0 2342; GFX1010-NEXT: s_cmp_lg_u32 s7, 0 2343; GFX1010-NEXT: s_subb_u32 s6, s6, s3 2344; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 2345; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 2346; GFX1010-NEXT: s_subb_u32 s6, s6, 0 2347; GFX1010-NEXT: s_cmp_ge_u32 s6, s3 2348; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2349; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 2350; GFX1010-NEXT: s_cmp_eq_u32 s6, s3 2351; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 2352; GFX1010-NEXT: s_add_u32 s6, s1, 1 2353; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 2354; GFX1010-NEXT: s_addc_u32 s12, s5, 0 2355; GFX1010-NEXT: s_add_u32 s13, s1, 2 2356; GFX1010-NEXT: s_addc_u32 s14, s5, 0 2357; GFX1010-NEXT: s_cmp_lg_u32 s7, 0 2358; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 2359; GFX1010-NEXT: s_subb_u32 s0, s11, s0 2360; GFX1010-NEXT: v_mov_b32_e32 v2, s13 2361; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 2362; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2363; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 2364; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 2365; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 2366; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 2367; GFX1010-NEXT: v_mov_b32_e32 v1, s14 2368; GFX1010-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 2369; GFX1010-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo 2370; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 2371; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 2372; GFX1010-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo 2373; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo 2374; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 2375; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 2376; GFX1010-NEXT: .LBB16_2: 2377; 
GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 2378; GFX1010-NEXT: s_sub_i32 s1, 0, s2 2379; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 2380; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2381; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 2382; GFX1010-NEXT: v_readfirstlane_b32 s0, v0 2383; GFX1010-NEXT: s_mul_i32 s1, s1, s0 2384; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1 2385; GFX1010-NEXT: s_add_i32 s0, s0, s1 2386; GFX1010-NEXT: s_mul_hi_u32 s0, s10, s0 2387; GFX1010-NEXT: s_mul_i32 s1, s0, s2 2388; GFX1010-NEXT: s_add_i32 s3, s0, 1 2389; GFX1010-NEXT: s_sub_i32 s1, s10, s1 2390; GFX1010-NEXT: s_sub_i32 s4, s1, s2 2391; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 2392; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 2393; GFX1010-NEXT: s_cselect_b32 s1, s4, s1 2394; GFX1010-NEXT: s_add_i32 s3, s0, 1 2395; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 2396; GFX1010-NEXT: s_mov_b32 s1, 0 2397; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 2398; GFX1010-NEXT: v_mov_b32_e32 v0, s0 2399; GFX1010-NEXT: v_mov_b32_e32 v1, s1 2400; GFX1010-NEXT: .LBB16_3: 2401; GFX1010-NEXT: v_mov_b32_e32 v2, 0 2402; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 2403; GFX1010-NEXT: s_endpgm 2404; GFX1010-NEXT: .LBB16_4: 2405; GFX1010-NEXT: ; implicit-def: $vgpr0_vgpr1 2406; GFX1010-NEXT: s_branch .LBB16_2 2407; 2408; GFX1030W32-LABEL: sudiv64: 2409; GFX1030W32: ; %bb.0: 2410; GFX1030W32-NEXT: s_clause 0x1 2411; GFX1030W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2412; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 2413; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) 2414; GFX1030W32-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] 2415; GFX1030W32-NEXT: s_mov_b32 s4, 0 2416; GFX1030W32-NEXT: s_cmp_lg_u64 s[4:5], 0 2417; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4 2418; GFX1030W32-NEXT: ; %bb.1: 2419; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2 2420; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v1, s3 2421; GFX1030W32-NEXT: s_sub_u32 s5, 0, s2 2422; GFX1030W32-NEXT: s_subb_u32 s6, 0, s3 2423; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 2424; 
GFX1030W32-NEXT: v_rcp_f32_e32 v0, v0 2425; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 2426; GFX1030W32-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 2427; GFX1030W32-NEXT: v_trunc_f32_e32 v1, v1 2428; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 2429; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1 2430; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 2431; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v1 2432; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 2433; GFX1030W32-NEXT: s_mul_i32 s7, s5, s0 2434; GFX1030W32-NEXT: s_mul_hi_u32 s13, s5, s1 2435; GFX1030W32-NEXT: s_mul_i32 s12, s6, s1 2436; GFX1030W32-NEXT: s_add_i32 s7, s13, s7 2437; GFX1030W32-NEXT: s_mul_i32 s14, s5, s1 2438; GFX1030W32-NEXT: s_add_i32 s7, s7, s12 2439; GFX1030W32-NEXT: s_mul_hi_u32 s13, s1, s14 2440; GFX1030W32-NEXT: s_mul_hi_u32 s15, s0, s14 2441; GFX1030W32-NEXT: s_mul_i32 s12, s0, s14 2442; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s7 2443; GFX1030W32-NEXT: s_mul_i32 s1, s1, s7 2444; GFX1030W32-NEXT: s_mul_hi_u32 s16, s0, s7 2445; GFX1030W32-NEXT: s_add_u32 s1, s13, s1 2446; GFX1030W32-NEXT: s_addc_u32 s13, 0, s14 2447; GFX1030W32-NEXT: s_add_u32 s1, s1, s12 2448; GFX1030W32-NEXT: s_mul_i32 s7, s0, s7 2449; GFX1030W32-NEXT: s_addc_u32 s1, s13, s15 2450; GFX1030W32-NEXT: s_addc_u32 s12, s16, 0 2451; GFX1030W32-NEXT: s_add_u32 s1, s1, s7 2452; GFX1030W32-NEXT: s_addc_u32 s7, 0, s12 2453; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1 2454; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0 2455; GFX1030W32-NEXT: s_addc_u32 s0, s0, s7 2456; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 2457; GFX1030W32-NEXT: s_mul_i32 s7, s5, s0 2458; GFX1030W32-NEXT: s_mul_hi_u32 s12, s5, s1 2459; GFX1030W32-NEXT: s_mul_i32 s6, s6, s1 2460; GFX1030W32-NEXT: s_add_i32 s7, s12, s7 2461; GFX1030W32-NEXT: s_mul_i32 s5, s5, s1 2462; GFX1030W32-NEXT: s_add_i32 s7, s7, s6 2463; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s5 2464; GFX1030W32-NEXT: s_mul_i32 s13, s0, s5 2465; GFX1030W32-NEXT: s_mul_hi_u32 s5, s1, s5 2466; GFX1030W32-NEXT: 
s_mul_hi_u32 s14, s1, s7 2467; GFX1030W32-NEXT: s_mul_i32 s1, s1, s7 2468; GFX1030W32-NEXT: s_mul_hi_u32 s6, s0, s7 2469; GFX1030W32-NEXT: s_add_u32 s1, s5, s1 2470; GFX1030W32-NEXT: s_addc_u32 s5, 0, s14 2471; GFX1030W32-NEXT: s_add_u32 s1, s1, s13 2472; GFX1030W32-NEXT: s_mul_i32 s7, s0, s7 2473; GFX1030W32-NEXT: s_addc_u32 s1, s5, s12 2474; GFX1030W32-NEXT: s_addc_u32 s5, s6, 0 2475; GFX1030W32-NEXT: s_add_u32 s1, s1, s7 2476; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 2477; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1 2478; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0 2479; GFX1030W32-NEXT: s_addc_u32 s0, s0, s5 2480; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 2481; GFX1030W32-NEXT: s_mul_i32 s6, s10, s0 2482; GFX1030W32-NEXT: s_mul_hi_u32 s5, s10, s0 2483; GFX1030W32-NEXT: s_mul_hi_u32 s7, s11, s0 2484; GFX1030W32-NEXT: s_mul_i32 s0, s11, s0 2485; GFX1030W32-NEXT: s_mul_hi_u32 s12, s10, s1 2486; GFX1030W32-NEXT: s_mul_hi_u32 s13, s11, s1 2487; GFX1030W32-NEXT: s_mul_i32 s1, s11, s1 2488; GFX1030W32-NEXT: s_add_u32 s6, s12, s6 2489; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 2490; GFX1030W32-NEXT: s_add_u32 s1, s6, s1 2491; GFX1030W32-NEXT: s_addc_u32 s1, s5, s13 2492; GFX1030W32-NEXT: s_addc_u32 s5, s7, 0 2493; GFX1030W32-NEXT: s_add_u32 s1, s1, s0 2494; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 2495; GFX1030W32-NEXT: s_mul_hi_u32 s0, s2, s1 2496; GFX1030W32-NEXT: s_mul_i32 s7, s2, s5 2497; GFX1030W32-NEXT: s_mul_i32 s12, s2, s1 2498; GFX1030W32-NEXT: s_add_i32 s0, s0, s7 2499; GFX1030W32-NEXT: v_sub_co_u32 v0, s7, s10, s12 2500; GFX1030W32-NEXT: s_mul_i32 s6, s3, s1 2501; GFX1030W32-NEXT: s_add_i32 s0, s0, s6 2502; GFX1030W32-NEXT: v_sub_co_u32 v1, s12, v0, s2 2503; GFX1030W32-NEXT: s_sub_i32 s6, s11, s0 2504; GFX1030W32-NEXT: s_cmp_lg_u32 s7, 0 2505; GFX1030W32-NEXT: s_subb_u32 s6, s6, s3 2506; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 2507; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 2508; GFX1030W32-NEXT: s_subb_u32 s6, s6, 0 2509; GFX1030W32-NEXT: s_cmp_ge_u32 s6, s3 2510; 
GFX1030W32-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2511; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 2512; GFX1030W32-NEXT: s_cmp_eq_u32 s6, s3 2513; GFX1030W32-NEXT: s_cselect_b32 vcc_lo, -1, 0 2514; GFX1030W32-NEXT: s_add_u32 s6, s1, 1 2515; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 2516; GFX1030W32-NEXT: s_addc_u32 s12, s5, 0 2517; GFX1030W32-NEXT: s_add_u32 s13, s1, 2 2518; GFX1030W32-NEXT: s_addc_u32 s14, s5, 0 2519; GFX1030W32-NEXT: s_cmp_lg_u32 s7, 0 2520; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 2521; GFX1030W32-NEXT: s_subb_u32 s0, s11, s0 2522; GFX1030W32-NEXT: v_mov_b32_e32 v2, s13 2523; GFX1030W32-NEXT: s_cmp_ge_u32 s0, s3 2524; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2525; GFX1030W32-NEXT: s_cselect_b32 s7, -1, 0 2526; GFX1030W32-NEXT: s_cmp_eq_u32 s0, s3 2527; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 2528; GFX1030W32-NEXT: s_cselect_b32 s0, -1, 0 2529; GFX1030W32-NEXT: v_mov_b32_e32 v1, s14 2530; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 2531; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo 2532; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 2533; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 2534; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo 2535; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo 2536; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 2537; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3 2538; GFX1030W32-NEXT: .LBB16_2: 2539; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2 2540; GFX1030W32-NEXT: s_sub_i32 s1, 0, s2 2541; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0 2542; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2543; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 2544; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v0 2545; GFX1030W32-NEXT: s_mul_i32 s1, s1, s0 2546; GFX1030W32-NEXT: s_mul_hi_u32 s1, s0, s1 2547; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 2548; GFX1030W32-NEXT: s_mul_hi_u32 s0, s10, s0 2549; GFX1030W32-NEXT: s_mul_i32 s1, s0, s2 2550; GFX1030W32-NEXT: 
s_add_i32 s3, s0, 1 2551; GFX1030W32-NEXT: s_sub_i32 s1, s10, s1 2552; GFX1030W32-NEXT: s_sub_i32 s4, s1, s2 2553; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2 2554; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0 2555; GFX1030W32-NEXT: s_cselect_b32 s1, s4, s1 2556; GFX1030W32-NEXT: s_add_i32 s3, s0, 1 2557; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2 2558; GFX1030W32-NEXT: s_mov_b32 s1, 0 2559; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0 2560; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0 2561; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1 2562; GFX1030W32-NEXT: .LBB16_3: 2563; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 2564; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 2565; GFX1030W32-NEXT: s_endpgm 2566; GFX1030W32-NEXT: .LBB16_4: 2567; GFX1030W32-NEXT: ; implicit-def: $vgpr0_vgpr1 2568; GFX1030W32-NEXT: s_branch .LBB16_2 2569; 2570; GFX1030W64-LABEL: sudiv64: 2571; GFX1030W64: ; %bb.0: 2572; GFX1030W64-NEXT: s_clause 0x1 2573; GFX1030W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 2574; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 2575; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) 2576; GFX1030W64-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] 2577; GFX1030W64-NEXT: s_mov_b32 s0, 0 2578; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2579; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4 2580; GFX1030W64-NEXT: ; %bb.1: 2581; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 2582; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s3 2583; GFX1030W64-NEXT: s_sub_u32 s5, 0, s2 2584; GFX1030W64-NEXT: s_subb_u32 s6, 0, s3 2585; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 2586; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 2587; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 2588; GFX1030W64-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 2589; GFX1030W64-NEXT: v_trunc_f32_e32 v1, v1 2590; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 2591; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 2592; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 2593; GFX1030W64-NEXT: v_readfirstlane_b32 s4, v1 2594; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 2595; 
GFX1030W64-NEXT: s_mul_i32 s1, s5, s4 2596; GFX1030W64-NEXT: s_mul_hi_u32 s12, s5, s0 2597; GFX1030W64-NEXT: s_mul_i32 s7, s6, s0 2598; GFX1030W64-NEXT: s_add_i32 s1, s12, s1 2599; GFX1030W64-NEXT: s_mul_i32 s13, s5, s0 2600; GFX1030W64-NEXT: s_add_i32 s1, s1, s7 2601; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s13 2602; GFX1030W64-NEXT: s_mul_hi_u32 s14, s4, s13 2603; GFX1030W64-NEXT: s_mul_i32 s7, s4, s13 2604; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 2605; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 2606; GFX1030W64-NEXT: s_mul_hi_u32 s15, s4, s1 2607; GFX1030W64-NEXT: s_add_u32 s0, s12, s0 2608; GFX1030W64-NEXT: s_addc_u32 s12, 0, s13 2609; GFX1030W64-NEXT: s_add_u32 s0, s0, s7 2610; GFX1030W64-NEXT: s_mul_i32 s1, s4, s1 2611; GFX1030W64-NEXT: s_addc_u32 s0, s12, s14 2612; GFX1030W64-NEXT: s_addc_u32 s7, s15, 0 2613; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 2614; GFX1030W64-NEXT: s_addc_u32 s7, 0, s7 2615; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 2616; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2617; GFX1030W64-NEXT: s_addc_u32 s4, s4, s7 2618; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 2619; GFX1030W64-NEXT: s_mul_i32 s1, s5, s4 2620; GFX1030W64-NEXT: s_mul_hi_u32 s7, s5, s0 2621; GFX1030W64-NEXT: s_mul_i32 s6, s6, s0 2622; GFX1030W64-NEXT: s_add_i32 s1, s7, s1 2623; GFX1030W64-NEXT: s_mul_i32 s5, s5, s0 2624; GFX1030W64-NEXT: s_add_i32 s1, s1, s6 2625; GFX1030W64-NEXT: s_mul_hi_u32 s7, s4, s5 2626; GFX1030W64-NEXT: s_mul_i32 s12, s4, s5 2627; GFX1030W64-NEXT: s_mul_hi_u32 s5, s0, s5 2628; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 2629; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 2630; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s1 2631; GFX1030W64-NEXT: s_add_u32 s0, s5, s0 2632; GFX1030W64-NEXT: s_addc_u32 s5, 0, s13 2633; GFX1030W64-NEXT: s_add_u32 s0, s0, s12 2634; GFX1030W64-NEXT: s_mul_i32 s1, s4, s1 2635; GFX1030W64-NEXT: s_addc_u32 s0, s5, s7 2636; GFX1030W64-NEXT: s_addc_u32 s5, s6, 0 2637; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 2638; GFX1030W64-NEXT: s_addc_u32 s5, 0, 
s5 2639; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 2640; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2641; GFX1030W64-NEXT: s_addc_u32 s0, s4, s5 2642; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v0 2643; GFX1030W64-NEXT: s_mul_i32 s5, s10, s0 2644; GFX1030W64-NEXT: s_mul_hi_u32 s4, s10, s0 2645; GFX1030W64-NEXT: s_mul_hi_u32 s6, s11, s0 2646; GFX1030W64-NEXT: s_mul_i32 s0, s11, s0 2647; GFX1030W64-NEXT: s_mul_hi_u32 s7, s10, s1 2648; GFX1030W64-NEXT: s_mul_hi_u32 s12, s11, s1 2649; GFX1030W64-NEXT: s_mul_i32 s1, s11, s1 2650; GFX1030W64-NEXT: s_add_u32 s5, s7, s5 2651; GFX1030W64-NEXT: s_addc_u32 s4, 0, s4 2652; GFX1030W64-NEXT: s_add_u32 s1, s5, s1 2653; GFX1030W64-NEXT: s_addc_u32 s1, s4, s12 2654; GFX1030W64-NEXT: s_addc_u32 s4, s6, 0 2655; GFX1030W64-NEXT: s_add_u32 s6, s1, s0 2656; GFX1030W64-NEXT: s_addc_u32 s7, 0, s4 2657; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s6 2658; GFX1030W64-NEXT: s_mul_i32 s1, s2, s7 2659; GFX1030W64-NEXT: s_mul_i32 s5, s2, s6 2660; GFX1030W64-NEXT: s_add_i32 s12, s0, s1 2661; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s10, s5 2662; GFX1030W64-NEXT: s_mul_i32 s4, s3, s6 2663; GFX1030W64-NEXT: s_add_i32 s12, s12, s4 2664; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], v0, s2 2665; GFX1030W64-NEXT: s_sub_i32 s13, s11, s12 2666; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 2667; GFX1030W64-NEXT: s_subb_u32 s13, s13, s3 2668; GFX1030W64-NEXT: s_cmp_lg_u64 s[4:5], 0 2669; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 2670; GFX1030W64-NEXT: s_subb_u32 s4, s13, 0 2671; GFX1030W64-NEXT: s_cmp_ge_u32 s4, s3 2672; GFX1030W64-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 2673; GFX1030W64-NEXT: s_cselect_b32 s5, -1, 0 2674; GFX1030W64-NEXT: s_cmp_eq_u32 s4, s3 2675; GFX1030W64-NEXT: s_cselect_b64 vcc, -1, 0 2676; GFX1030W64-NEXT: s_add_u32 s4, s6, 1 2677; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc 2678; GFX1030W64-NEXT: s_addc_u32 s5, s7, 0 2679; GFX1030W64-NEXT: s_add_u32 s13, s6, 2 2680; GFX1030W64-NEXT: s_addc_u32 s14, s7, 0 2681; GFX1030W64-NEXT: 
s_cmp_lg_u64 s[0:1], 0 2682; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2683; GFX1030W64-NEXT: s_subb_u32 s0, s11, s12 2684; GFX1030W64-NEXT: v_mov_b32_e32 v2, s13 2685; GFX1030W64-NEXT: s_cmp_ge_u32 s0, s3 2686; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 2687; GFX1030W64-NEXT: s_cselect_b32 s11, -1, 0 2688; GFX1030W64-NEXT: s_cmp_eq_u32 s0, s3 2689; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 2690; GFX1030W64-NEXT: s_cselect_b64 s[0:1], -1, 0 2691; GFX1030W64-NEXT: v_mov_b32_e32 v1, s14 2692; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s11, v0, s[0:1] 2693; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc 2694; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc 2695; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 2696; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc 2697; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc 2698; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 2699; GFX1030W64-NEXT: .LBB16_2: 2700; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 2701; GFX1030W64-NEXT: s_sub_i32 s1, 0, s2 2702; GFX1030W64-NEXT: v_rcp_iflag_f32_e32 v0, v0 2703; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2704; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 2705; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 2706; GFX1030W64-NEXT: s_mul_i32 s1, s1, s0 2707; GFX1030W64-NEXT: s_mul_hi_u32 s1, s0, s1 2708; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 2709; GFX1030W64-NEXT: s_mul_hi_u32 s0, s10, s0 2710; GFX1030W64-NEXT: s_mul_i32 s1, s0, s2 2711; GFX1030W64-NEXT: s_add_i32 s3, s0, 1 2712; GFX1030W64-NEXT: s_sub_i32 s1, s10, s1 2713; GFX1030W64-NEXT: s_sub_i32 s4, s1, s2 2714; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2 2715; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0 2716; GFX1030W64-NEXT: s_cselect_b32 s1, s4, s1 2717; GFX1030W64-NEXT: s_add_i32 s3, s0, 1 2718; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2 2719; GFX1030W64-NEXT: s_mov_b32 s1, 0 2720; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0 2721; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0 2722; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1 2723; 
GFX1030W64-NEXT: .LBB16_3: 2724; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 2725; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] 2726; GFX1030W64-NEXT: s_endpgm 2727; GFX1030W64-NEXT: .LBB16_4: 2728; GFX1030W64-NEXT: ; implicit-def: $vgpr0_vgpr1 2729; GFX1030W64-NEXT: s_branch .LBB16_2 2730; 2731; GFX11-LABEL: sudiv64: 2732; GFX11: ; %bb.0: 2733; GFX11-NEXT: s_clause 0x1 2734; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 2735; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 2736; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2737; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] 2738; GFX11-NEXT: s_mov_b32 s4, 0 2739; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2740; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 2741; GFX11-NEXT: s_cbranch_scc0 .LBB16_4 2742; GFX11-NEXT: ; %bb.1: 2743; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 2744; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3 2745; GFX11-NEXT: s_sub_u32 s5, 0, s2 2746; GFX11-NEXT: s_subb_u32 s6, 0, s3 2747; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2748; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 2749; GFX11-NEXT: v_rcp_f32_e32 v0, v0 2750; GFX11-NEXT: s_waitcnt_depctr 0xfff 2751; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 2752; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2753; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 2754; GFX11-NEXT: v_trunc_f32_e32 v1, v1 2755; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 2756; GFX11-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 2757; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 2758; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 2759; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 2760; GFX11-NEXT: v_readfirstlane_b32 s0, v1 2761; GFX11-NEXT: v_readfirstlane_b32 s1, v0 2762; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 2763; GFX11-NEXT: s_mul_i32 s7, s5, s0 2764; GFX11-NEXT: s_mul_hi_u32 s13, s5, s1 2765; GFX11-NEXT: s_mul_i32 s12, 
s6, s1 2766; GFX11-NEXT: s_add_i32 s7, s13, s7 2767; GFX11-NEXT: s_mul_i32 s14, s5, s1 2768; GFX11-NEXT: s_add_i32 s7, s7, s12 2769; GFX11-NEXT: s_mul_hi_u32 s13, s1, s14 2770; GFX11-NEXT: s_mul_hi_u32 s15, s0, s14 2771; GFX11-NEXT: s_mul_i32 s12, s0, s14 2772; GFX11-NEXT: s_mul_hi_u32 s14, s1, s7 2773; GFX11-NEXT: s_mul_i32 s1, s1, s7 2774; GFX11-NEXT: s_mul_hi_u32 s16, s0, s7 2775; GFX11-NEXT: s_add_u32 s1, s13, s1 2776; GFX11-NEXT: s_addc_u32 s13, 0, s14 2777; GFX11-NEXT: s_add_u32 s1, s1, s12 2778; GFX11-NEXT: s_mul_i32 s7, s0, s7 2779; GFX11-NEXT: s_addc_u32 s1, s13, s15 2780; GFX11-NEXT: s_addc_u32 s12, s16, 0 2781; GFX11-NEXT: s_add_u32 s1, s1, s7 2782; GFX11-NEXT: s_addc_u32 s7, 0, s12 2783; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 2784; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 2785; GFX11-NEXT: s_cmp_lg_u32 s1, 0 2786; GFX11-NEXT: s_addc_u32 s0, s0, s7 2787; GFX11-NEXT: v_readfirstlane_b32 s1, v0 2788; GFX11-NEXT: s_mul_i32 s7, s5, s0 2789; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2790; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1 2791; GFX11-NEXT: s_mul_i32 s6, s6, s1 2792; GFX11-NEXT: s_add_i32 s7, s12, s7 2793; GFX11-NEXT: s_mul_i32 s5, s5, s1 2794; GFX11-NEXT: s_add_i32 s7, s7, s6 2795; GFX11-NEXT: s_mul_hi_u32 s12, s0, s5 2796; GFX11-NEXT: s_mul_i32 s13, s0, s5 2797; GFX11-NEXT: s_mul_hi_u32 s5, s1, s5 2798; GFX11-NEXT: s_mul_hi_u32 s14, s1, s7 2799; GFX11-NEXT: s_mul_i32 s1, s1, s7 2800; GFX11-NEXT: s_mul_hi_u32 s6, s0, s7 2801; GFX11-NEXT: s_add_u32 s1, s5, s1 2802; GFX11-NEXT: s_addc_u32 s5, 0, s14 2803; GFX11-NEXT: s_add_u32 s1, s1, s13 2804; GFX11-NEXT: s_mul_i32 s7, s0, s7 2805; GFX11-NEXT: s_addc_u32 s1, s5, s12 2806; GFX11-NEXT: s_addc_u32 s5, s6, 0 2807; GFX11-NEXT: s_add_u32 s1, s1, s7 2808; GFX11-NEXT: s_addc_u32 s5, 0, s5 2809; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 2810; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 2811; GFX11-NEXT: s_cmp_lg_u32 s1, 0 2812; 
GFX11-NEXT: s_addc_u32 s0, s0, s5 2813; GFX11-NEXT: v_readfirstlane_b32 s1, v0 2814; GFX11-NEXT: s_mul_i32 s6, s10, s0 2815; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0 2816; GFX11-NEXT: s_mul_hi_u32 s7, s11, s0 2817; GFX11-NEXT: s_mul_i32 s0, s11, s0 2818; GFX11-NEXT: s_mul_hi_u32 s12, s10, s1 2819; GFX11-NEXT: s_mul_hi_u32 s13, s11, s1 2820; GFX11-NEXT: s_mul_i32 s1, s11, s1 2821; GFX11-NEXT: s_add_u32 s6, s12, s6 2822; GFX11-NEXT: s_addc_u32 s5, 0, s5 2823; GFX11-NEXT: s_add_u32 s1, s6, s1 2824; GFX11-NEXT: s_addc_u32 s1, s5, s13 2825; GFX11-NEXT: s_addc_u32 s5, s7, 0 2826; GFX11-NEXT: s_add_u32 s1, s1, s0 2827; GFX11-NEXT: s_addc_u32 s5, 0, s5 2828; GFX11-NEXT: s_mul_hi_u32 s0, s2, s1 2829; GFX11-NEXT: s_mul_i32 s7, s2, s5 2830; GFX11-NEXT: s_mul_i32 s12, s2, s1 2831; GFX11-NEXT: s_add_i32 s0, s0, s7 2832; GFX11-NEXT: v_sub_co_u32 v0, s7, s10, s12 2833; GFX11-NEXT: s_mul_i32 s6, s3, s1 2834; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2835; GFX11-NEXT: s_add_i32 s0, s0, s6 2836; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2 2837; GFX11-NEXT: s_sub_i32 s6, s11, s0 2838; GFX11-NEXT: s_cmp_lg_u32 s7, 0 2839; GFX11-NEXT: s_subb_u32 s6, s6, s3 2840; GFX11-NEXT: s_cmp_lg_u32 s12, 0 2841; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 2842; GFX11-NEXT: s_subb_u32 s6, s6, 0 2843; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2844; GFX11-NEXT: s_cmp_ge_u32 s6, s3 2845; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo 2846; GFX11-NEXT: s_cselect_b32 s12, -1, 0 2847; GFX11-NEXT: s_cmp_eq_u32 s6, s3 2848; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 2849; GFX11-NEXT: s_add_u32 s6, s1, 1 2850; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 2851; GFX11-NEXT: s_addc_u32 s12, s5, 0 2852; GFX11-NEXT: s_add_u32 s13, s1, 2 2853; GFX11-NEXT: s_addc_u32 s14, s5, 0 2854; GFX11-NEXT: v_mov_b32_e32 v2, s13 2855; GFX11-NEXT: s_cmp_lg_u32 s7, 0 2856; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 2857; GFX11-NEXT: s_subb_u32 s0, s11, s0 2858; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) 2859; GFX11-NEXT: s_cmp_ge_u32 s0, s3 2860; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo 2861; GFX11-NEXT: s_cselect_b32 s7, -1, 0 2862; GFX11-NEXT: s_cmp_eq_u32 s0, s3 2863; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 2864; GFX11-NEXT: s_cselect_b32 s0, -1, 0 2865; GFX11-NEXT: v_mov_b32_e32 v1, s14 2866; GFX11-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 2867; GFX11-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo 2868; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 2869; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo 2870; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 2871; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) 2872; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo 2873; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo 2874; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 2875; GFX11-NEXT: s_cbranch_vccnz .LBB16_3 2876; GFX11-NEXT: .LBB16_2: 2877; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 2878; GFX11-NEXT: s_sub_i32 s1, 0, s2 2879; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 2880; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 2881; GFX11-NEXT: s_waitcnt_depctr 0xfff 2882; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2883; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 2884; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 2885; GFX11-NEXT: v_readfirstlane_b32 s0, v0 2886; GFX11-NEXT: s_mul_i32 s1, s1, s0 2887; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 2888; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 2889; GFX11-NEXT: s_add_i32 s0, s0, s1 2890; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 2891; GFX11-NEXT: s_mul_hi_u32 s0, s10, s0 2892; GFX11-NEXT: s_mul_i32 s1, s0, s2 2893; GFX11-NEXT: s_add_i32 s3, s0, 1 2894; GFX11-NEXT: s_sub_i32 s1, s10, s1 2895; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2896; GFX11-NEXT: s_sub_i32 s4, s1, s2 2897; 
GFX11-NEXT: s_cmp_ge_u32 s1, s2 2898; GFX11-NEXT: s_cselect_b32 s0, s3, s0 2899; GFX11-NEXT: s_cselect_b32 s1, s4, s1 2900; GFX11-NEXT: s_add_i32 s3, s0, 1 2901; GFX11-NEXT: s_cmp_ge_u32 s1, s2 2902; GFX11-NEXT: s_mov_b32 s1, 0 2903; GFX11-NEXT: s_cselect_b32 s0, s3, s0 2904; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 2905; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 2906; GFX11-NEXT: .LBB16_3: 2907; GFX11-NEXT: v_mov_b32_e32 v2, 0 2908; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9] 2909; GFX11-NEXT: s_endpgm 2910; GFX11-NEXT: .LBB16_4: 2911; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 2912; GFX11-NEXT: s_branch .LBB16_2 2913 %result = udiv i64 %x, %y 2914 store i64 %result, ptr addrspace(1) %out 2915 ret void 2916} 2917 2918 2919 2920declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1 2921 2922declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 2923 2924declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1 2925 2926declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 2927 2928declare i32 @llvm.amdgcn.workitem.id.x() #1 2929 2930attributes #0 = { nounwind } 2931attributes #1 = { nounwind readnone } 2932 2933;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: 2934; GCN-ISEL: {{.*}} 2935