; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s

define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64_zext:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_add_u32 s0, s2, s8
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    s_addc_u32 s1, s3, s9
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_i64_zext:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_add_u32 s0, s2, s4
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_addc_u32 s1, s3, s5
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_i64_zext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_add_u32 s4, s2, s6
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    s_addc_u32 s5, s3, s7
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  %ext = zext i1 %carry to i64
  %add2 = add i64 %val, %ext
  store i64 %add2, ptr addrspace(1) %out, align 8
  ret void
}

; FIXME: Could do scalar

define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_uaddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    v_mov_b32_e32 v0, s13
; SI-NEXT:    v_add_i32_e32 v0, vcc, s12, v0
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v4, s5
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    flat_store_byte v[2:3], v5
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s7
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s6, v1
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    global_store_byte v0, v2, s[2:3]
; GFX9-NEXT:    s_endpgm
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    flat_store_byte v[2:3], v5
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[12:13]
; GFX9-NEXT:    global_load_dword v2, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
; GFX9-NEXT:    global_store_byte v0, v2, s[10:11]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i32_novcc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    ;;#ASMSTART
; SI-NEXT:    ;;#ASMEND
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i32_novcc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dword v4, v[0:1]
; VI-NEXT:    flat_load_dword v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT:    flat_store_dword v[0:1], v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    ;;#ASMSTART
; VI-NEXT:    ;;#ASMEND
; VI-NEXT:    flat_store_byte v[2:3], v5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i32_novcc:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[12:13]
; GFX9-NEXT:    global_load_dword v2, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    global_store_byte v0, v2, s[10:11]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep, align 4
  %b = load i32, ptr addrspace(1) %b.gep, align 4
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  store volatile i32 %val, ptr addrspace(1) %out, align 4
  call void asm sideeffect "", "~{vcc}"() #0
  store volatile i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
; SI-LABEL: s_uaddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_add_u32 s6, s4, s6
; SI-NEXT:    s_addc_u32 s7, s5, s7
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_add_u32 s0, s4, s6
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_addc_u32 s1, s5, s7
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_mov_b32_e32 v7, s1
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT:    v_mov_b32_e32 v6, s0
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_u32 s0, s12, s14
; GFX9-NEXT:    v_mov_b32_e32 v0, s12
; GFX9-NEXT:    v_mov_b32_e32 v1, s13
; GFX9-NEXT:    s_addc_u32 s1, s13, s15
; GFX9-NEXT:    v_mov_b32_e32 v3, s1
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v2, s0
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
; GFX9-NEXT:    s_endpgm
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, ptr addrspace(1) %out, align 8
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v2, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
; VI-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    flat_store_byte v[6:7], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[8:9]
; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT:    global_store_byte v4, v0, s[10:11]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
  %a = load i64, ptr addrspace(1) %a.gep
  %b = load i64, ptr addrspace(1) %b.gep
  %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
  %val = extractvalue { i64, i1 } %uadd, 0
  %carry = extractvalue { i64, i1 } %uadd, 1
  store i64 %val, ptr addrspace(1) %out
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_ushort v4, v[0:1]
; VI-NEXT:    flat_load_ushort v5, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    v_mov_b32_e32 v3, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v5, vcc, v4, v5
; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v5
; VI-NEXT:    v_cmp_lt_u32_e32 vcc, v6, v4
; VI-NEXT:    flat_store_short v[0:1], v5
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT:    flat_store_byte v[2:3], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[12:13]
; GFX9-NEXT:    global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v2
; GFX9-NEXT:    v_cmp_lt_u32_sdwa s[0:1], v2, v1 src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT:    global_store_short v0, v2, s[8:9]
; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
; GFX9-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
  %a = load i16, ptr addrspace(1) %a.gep
  %b = load i16, ptr addrspace(1) %b.gep
  %uadd = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 %a, i16 %b)
  %val = extractvalue { i16, i1 } %uadd, 0
  %carry = extractvalue { i16, i1 } %uadd, 1
  store i16 %val, ptr addrspace(1) %out
  store i1 %carry, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_uaddo_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s12, s4
; SI-NEXT:    s_mov_b32 s13, s5
; SI-NEXT:    s_mov_b32 s16, s6
; SI-NEXT:    s_mov_b32 s17, s7
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    flat_store_dwordx2 v[6:7], v[2:3]
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[12:13]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[8:9]
; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT:    s_endpgm
  %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
  %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
  %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
  %val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
  %carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
  store <2 x i32> %val, ptr addrspace(1) %out, align 4
  %carry.ext = zext <2 x i1> %carry to <2 x i32>
  store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_uaddo_clamp_bit:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT:    s_cmp_eq_u32 s0, s1
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_cbranch_scc1 .LBB8_2
; SI-NEXT:  ; %bb.1: ; %if
; SI-NEXT:    s_xor_b64 s[0:1], vcc, -1
; SI-NEXT:  .LBB8_2: ; %exit
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uaddo_clamp_bit:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    s_cmp_eq_u32 s0, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_cbranch_scc1 .LBB8_2
; VI-NEXT:  ; %bb.1: ; %if
; VI-NEXT:    s_xor_b64 s[0:1], vcc, -1
; VI-NEXT:  .LBB8_2: ; %exit
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_mov_b32_e32 v3, s6
; VI-NEXT:    v_mov_b32_e32 v4, s7
; VI-NEXT:    flat_store_dword v[1:2], v0
; VI-NEXT:    flat_store_byte v[3:4], v5
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: s_uaddo_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    s_cmp_eq_u32 s0, s1
; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT:    s_mov_b64 s[0:1], 0
; GFX9-NEXT:    s_cbranch_scc1 .LBB8_2
; GFX9-NEXT:  ; %bb.1: ; %if
; GFX9-NEXT:    s_xor_b64 s[0:1], vcc, -1
; GFX9-NEXT:  .LBB8_2: ; %exit
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[8:9]
; GFX9-NEXT:    global_store_byte v1, v2, s[10:11]
; GFX9-NEXT:    s_endpgm
entry:
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %cout, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_uaddo_clamp_bit:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s8
; SI-NEXT:    s_mov_b32 s1, s9
; SI-NEXT:    s_mov_b32 s12, s10
; SI-NEXT:    s_mov_b32 s13, s11
; SI-NEXT:    buffer_load_dword v1, off, s[0:3], 0
; SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
; SI-NEXT:    s_mov_b64 s[8:9], 0
; SI-NEXT:    s_cbranch_vccnz .LBB9_2
; SI-NEXT:  ; %bb.1: ; %if
; SI-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
; SI-NEXT:  .LBB9_2: ; %exit
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s7
; SI-NEXT:    s_mov_b32 s6, s2
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[8:9]
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uaddo_clamp_bit:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x24
; VI-NEXT:    s_mov_b64 s[2:3], 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s8
; VI-NEXT:    v_mov_b32_e32 v1, s9
; VI-NEXT:    v_mov_b32_e32 v2, s10
; VI-NEXT:    v_mov_b32_e32 v3, s11
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v1, v2
; VI-NEXT:    s_cbranch_vccnz .LBB9_2
; VI-NEXT:  ; %bb.1: ; %if
; VI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; VI-NEXT:  .LBB9_2: ; %exit
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    v_mov_b32_e32 v2, s5
; VI-NEXT:    v_mov_b32_e32 v3, s6
; VI-NEXT:    v_mov_b32_e32 v4, s7
; VI-NEXT:    flat_store_dword v[1:2], v0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT:    flat_store_byte v[3:4], v0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: v_uaddo_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[12:13]
; GFX9-NEXT:    global_load_dword v3, v0, s[14:15]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v2, v3
; GFX9-NEXT:    s_cbranch_vccnz .LBB9_2
; GFX9-NEXT:  ; %bb.1: ; %if
; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
; GFX9-NEXT:  .LBB9_2: ; %exit
; GFX9-NEXT:    global_store_dword v0, v1, s[8:9]
; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT:    global_store_byte v0, v1, s[10:11]
; GFX9-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
  %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
  %a = load i32, ptr addrspace(1) %a.gep
  %b = load i32, ptr addrspace(1) %b.gep
  %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val = extractvalue { i32, i1 } %uadd, 0
  %carry = extractvalue { i32, i1 } %uadd, 1
  %c2 = icmp eq i1 %carry, false
  %cc = icmp eq i32 %a, %b
  br i1 %cc, label %exit, label %if

if:
  br label %exit

exit:
  %cout = phi i1 [false, %entry], [%c2, %if]
  store i32 %val, ptr addrspace(1) %out, align 4
  store i1 %cout, ptr addrspace(1) %carryout
  ret void
}

define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) {
; SI-LABEL: sv_uaddo_i128:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
; SI-NEXT:    v_mov_b32_e32 v6, s1
; SI-NEXT:    v_mov_b32_e32 v7, s2
; SI-NEXT:    v_mov_b32_e32 v8, s3
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
; SI-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
; SI-NEXT:    v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT:    v_addc_u32_e32 v5, vcc, v8, v5, vcc
; SI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; SI-NEXT:    v_and_b32_e32 v2, 1, v2
; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: sv_uaddo_i128:
; VI:       ; %bb.0:
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v6, s1
; VI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
; VI-NEXT:    v_mov_b32_e32 v6, s3
; VI-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; VI-NEXT:    v_and_b32_e32 v2, 1, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: sv_uaddo_i128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT:    v_mov_b32_e32 v6, s1
; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
; GFX9-NEXT:    v_mov_b32_e32 v6, s2
; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
; GFX9-NEXT:    v_mov_b32_e32 v6, s3
; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5]
; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_endpgm
  %uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b)
  %carry = extractvalue { i128, i1 } %uadd, 1
  %carry.ext = zext i1 %carry to i32
  store i32 %carry.ext, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.uadd.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) #1
declare { i128, i1 } @llvm.uadd.with.overflow.i128(i128, i128) #1
declare { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone


attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }