; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s

; Integer `sub` lowering tests. The expected ISA below is regenerated by the
; script named in the NOTE line; do not edit those lines by hand.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable

; i32 subtract of two kernel arguments; a scalar subtract is expected on every
; target.
define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GFX6-LABEL: s_sub_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_sub_i32 s0, s2, s3
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sub_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sub_i32 s2, s2, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_sub_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_sub_i32 s2, s2, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: s_sub_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_sub_co_i32 s2, s2, s3
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %result = sub i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Constant minus kernel argument (1234 - %a); the constant 0x4d2 is expected as
; the first source operand of a scalar subtract.
define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
; GFX6-LABEL: s_sub_imm_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s6, s[4:5], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_sub_i32 s4, 0x4d2, s6
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sub_imm_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sub_i32 s2, 0x4d2, s2
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_sub_imm_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_sub_i32 s2, 0x4d2, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: s_sub_imm_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_sub_co_i32 s2, 0x4d2, s2
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %result = sub i32 1234, %a
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; i32 subtract of two adjacent values loaded from %in; one 64-bit load feeding
; a vector subtract is expected.
define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT:    flat_store_dword v[2:3], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %a = load i32, ptr addrspace(1) %in
  %b = load i32, ptr addrspace(1) %b_ptr
  %result = sub i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Constant minus loaded value (123 - %a); the constant 0x7b is expected as the
; first source operand of a vector subtract.
define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_imm_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 0x7b, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_imm_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_imm_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v1, 0x7b, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_imm_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v0, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0x7b, v1
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %in
  %result = sub i32 123, %a
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; <2 x i32> subtract of two adjacent vectors loaded from %in; one vector
; subtract per element is expected.
define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v3
; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v4, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v1, v3
; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v0, v2
; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <2 x i32>, ptr addrspace(1) %in
  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
  %result = sub <2 x i32> %a, %b
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

; <4 x i32> subtract of two adjacent vectors loaded from %in; two 128-bit loads
; and one vector subtract per element are expected.
define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s10, s6
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b32 s8, s2
; GFX6-NEXT:    s_mov_b32 s9, s3
; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_v4i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    s_add_u32 s2, s2, 16
; GFX8-NEXT:    s_addc_u32 s3, s3, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v8, s0
; GFX8-NEXT:    v_mov_b32_e32 v9, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v3, v7, v3
; GFX9-NEXT:    v_sub_u32_e32 v2, v6, v2
; GFX9-NEXT:    v_sub_u32_e32 v1, v5, v1
; GFX9-NEXT:    v_sub_u32_e32 v0, v4, v0
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_v4i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_mov_b32_e32 v8, 0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
; GFX12-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_nc_u32_e32 v3, v7, v3
; GFX12-NEXT:    v_sub_nc_u32_e32 v2, v6, v2
; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v5, v1
; GFX12-NEXT:    v_sub_nc_u32_e32 v0, v4, v0
; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT:    s_endpgm
  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <4 x i32>, ptr addrspace(1) %in
  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
  %result = sub <4 x i32> %a, %b
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; i16 subtract of two volatile loads at %in[tid] and %in[tid + 1]; a 32-bit
; subtract is expected on verde and a 16-bit subtract on the other targets.
define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s10, 0
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 offset:2 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v4, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_sub_u16_e32 v2, v4, v2
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_sub_u16_e32 v1, v1, v2
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_i16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_nc_u16 v0, v1, v0
; GFX12-NEXT:    global_store_b16 v2, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
  %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i32 1
  %a = load volatile i16, ptr addrspace(1) %gep
  %b = load volatile i16, ptr addrspace(1) %b_ptr
  %result = sub i16 %a, %b
  store i16 %result, ptr addrspace(1) %out
  ret void
}

; <2 x i16> subtract of two adjacent vectors at %in[tid]; v_pk_sub_i16 is
; expected on gfx900 and gfx1200, and per-half subtracts recombined with
; v_or_b32 on verde and fiji.
define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s10, 0
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v2, v3
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u16_e32 v4, v0, v1
; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX8-NEXT:    flat_store_dword v[2:3], v0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_v2i16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v1
; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1
  %a = load <2 x i16>, ptr addrspace(1) %gep
  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
  %result = sub <2 x i16> %a, %b
  store <2 x i16> %result, ptr addrspace(1) %out
  ret void
}

; <4 x i16> subtract of two adjacent vectors at %in[tid]; two v_pk_sub_i16 are
; expected on gfx900 and gfx1200, and per-half subtracts recombined with
; v_or_b32 on verde and fiji.
define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: test_sub_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s10, 0
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v5, v7
; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v4, v6
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v3
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: test_sub_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u16_e32 v6, v1, v3
; GFX8-NEXT:    v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_sub_u16_e32 v3, v0, v2
; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v1, v6, v1
; GFX8-NEXT:    v_or_b32_e32 v0, v3, v0
; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: test_sub_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_sub_i16 v1, v1, v3
; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v2
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: test_sub_v4i16:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v4, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_pk_sub_i16 v1, v1, v3
; GFX12-NEXT:    v_pk_sub_i16 v0, v0, v2
; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1
  %a = load <4 x i16>, ptr addrspace(1) %gep
  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
  %result = sub <4 x i16> %a, %b
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}

; i64 subtract of two kernel arguments; an s_sub_u32/s_subb_u32 pair is
; expected on verde, fiji and gfx900, and a single s_sub_nc_u64 on gfx1200.
define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64 %b) nounwind {
; GFX6-LABEL: s_sub_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_sub_u32 s0, s0, s2
; GFX6-NEXT:    s_subb_u32 s1, s1, s3
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_sub_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sub_u32 s0, s0, s2
; GFX8-NEXT:    s_subb_u32 s1, s1, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_sub_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_sub_u32 s0, s0, s2
; GFX9-NEXT:    s_subb_u32 s1, s1, s3
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: s_sub_i64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT:    s_endpgm
  %result = sub i64 %a, %b
  store i64 %result, ptr addrspace(1) %out, align 8
  ret void
}

; i64 subtract of values loaded from %inA[tid] and %inB[tid]; a low-half
; subtract followed by a subtract-with-borrow through vcc is expected.
define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) nounwind {
; GFX6-LABEL: v_sub_i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX6-NEXT:    s_mov_b32 s11, 0xf000
; GFX6-NEXT:    s_mov_b32 s14, 0
; GFX6-NEXT:    s_mov_b32 s15, s11
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    s_mov_b64 s[6:7], s[14:15]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[12:13], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[12:15], 0 addr64
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s8, s0
; GFX6-NEXT:    s_mov_b32 s9, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_sub_i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_sub_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: v_sub_i64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v4, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX12-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  %a_ptr = getelementptr i64, ptr addrspace(1) %inA, i32 %tid
  %b_ptr = getelementptr i64, ptr addrspace(1) %inB, i32 %tid
  %a = load i64, ptr addrspace(1) %a_ptr
  %b = load i64, ptr addrspace(1) %b_ptr
  %result = sub i64 %a, %b
  store i64 %result, ptr addrspace(1) %out, align 8
  ret void
}

; <2 x i64> subtract of vectors loaded from %inA[tid] and %inB[tid]; one
; subtract / subtract-with-borrow pair per element is expected.
define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
; GFX6-LABEL: v_test_sub_v2i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX6-NEXT:    s_mov_b32 s11, 0xf000
; GFX6-NEXT:    s_mov_b32 s14, 0
; GFX6-NEXT:    s_mov_b32 s15, s11
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX6-NEXT:    v_mov_b32_e32 v5, 0
; GFX6-NEXT:    s_mov_b64 s[6:7], s[14:15]
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[12:13], s[2:3]
; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[12:15], 0 addr64
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s8, s0
; GFX6-NEXT:    s_mov_b32 s9, s1
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v6, v2
; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_test_sub_v2i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, s4, v2
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_sub_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[6:7]
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX12-LABEL: v_test_sub_v2i64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v8, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
; GFX12-NEXT:    global_load_b128 v[4:7], v4, s[4:5]
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v4
; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
  %a_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inA, i32 %tid
  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %inB, i32 %tid
  %a = load <2 x i64>, ptr addrspace(1) %a_ptr
  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
  %result = sub <2 x i64> %a, %b
  store <2 x i64> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) {
; GFX6-LABEL: v_test_sub_v4i64:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX6-NEXT:    s_mov_b32 s11, 0xf000
; GFX6-NEXT:    s_mov_b32 s14, 0
; GFX6-NEXT:    s_mov_b32 s15, s11
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[12:13], s[2:3]
; GFX6-NEXT:    v_lshlrev_b32_e32 v12, 5, v0
; GFX6-NEXT:    v_mov_b32_e32 v13, 0
; GFX6-NEXT:    s_mov_b64 s[6:7], s[14:15]
; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[12:13], s[12:15], 0 addr64
; GFX6-NEXT:    buffer_load_dwordx4 v[4:7], v[12:13], s[4:7], 0 addr64
; GFX6-NEXT:    buffer_load_dwordx4 v[8:11], v[12:13], s[4:7], 0 addr64 offset:16
; GFX6-NEXT:    buffer_load_dwordx4 v[12:15], v[12:13], s[12:15], 0 addr64 offset:16
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_mov_b32 s8, s0
; GFX6-NEXT:    s_mov_b32 s9, s1
; GFX6-NEXT:    s_waitcnt vmcnt(2)
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v14, v10
; GFX6-NEXT:    v_subb_u32_e32 v7, vcc, v15, v11, vcc
; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v12, v8
; GFX6-NEXT:    v_subb_u32_e32 v5, vcc, v13, v9, vcc
; GFX6-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_test_sub_v4i64:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, s2, v0
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, s4, v0
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[8:9]
; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[12:13]
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 16, v8
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 16, v12
; GFX8-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
; GFX8-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
; GFX8-NEXT:    v_mov_b32_e32 v17, s1
; GFX8-NEXT:    v_mov_b32_e32 v16, s0
; GFX8-NEXT:    s_add_u32 s0, s0, 16
; GFX8-NEXT:    s_addc_u32 s1, s1, 0
; GFX8-NEXT:    s_waitcnt vmcnt(2)
; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
; GFX8-NEXT:    s_waitcnt vmcnt(1)
; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v10, v14
; GFX8-NEXT:    v_subb_u32_e32 v7, vcc, v11, v15, vcc
; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v8, v12
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v9, v13, vcc
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_test_sub_v4i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v16, 5, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[2:3]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[6:7]
; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[2:3]
offset:16 957; GFX9-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:16 958; GFX9-NEXT: v_mov_b32_e32 v16, 0 959; GFX9-NEXT: s_waitcnt vmcnt(2) 960; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 961; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc 962; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 963; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc 964; GFX9-NEXT: s_waitcnt vmcnt(0) 965; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v10, v14 966; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v11, v15, vcc 967; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v8, v12 968; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v13, vcc 969; GFX9-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 970; GFX9-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] 971; GFX9-NEXT: s_endpgm 972; 973; GFX12-LABEL: v_test_sub_v4i64: 974; GFX12: ; %bb.0: 975; GFX12-NEXT: s_clause 0x1 976; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 977; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 978; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 979; GFX12-NEXT: v_mov_b32_e32 v16, 0 980; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 981; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0 982; GFX12-NEXT: s_wait_kmcnt 0x0 983; GFX12-NEXT: s_clause 0x3 984; GFX12-NEXT: global_load_b128 v[0:3], v12, s[2:3] 985; GFX12-NEXT: global_load_b128 v[4:7], v12, s[4:5] 986; GFX12-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:16 987; GFX12-NEXT: global_load_b128 v[12:15], v12, s[4:5] offset:16 988; GFX12-NEXT: s_wait_loadcnt 0x2 989; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6 990; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo 991; GFX12-NEXT: s_wait_loadcnt 0x0 992; GFX12-NEXT: v_sub_co_u32 v10, vcc_lo, v10, v14 993; GFX12-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v11, v15, vcc_lo 994; GFX12-NEXT: v_sub_co_u32 v8, vcc_lo, v8, v12 995; GFX12-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v9, v13, vcc_lo 996; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 997; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo 998; GFX12-NEXT: s_clause 0x1 999; 
GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 1000; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] 1001; GFX12-NEXT: s_endpgm 1002 %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone 1003 %a_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inA, i32 %tid 1004 %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %inB, i32 %tid 1005 %a = load <4 x i64>, ptr addrspace(1) %a_ptr 1006 %b = load <4 x i64>, ptr addrspace(1) %b_ptr 1007 %result = sub <4 x i64> %a, %b 1008 store <4 x i64> %result, ptr addrspace(1) %out 1009 ret void 1010} 1011 1012; Make sure the VOP3 form of sub is initially selected. Otherwise a pair 1013; of copies from/to VCC would be necessary, since the inline asm below defines VCC before the sub and uses it after. 1014 1015define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) { 1016; GFX6-LABEL: sub_select_vop3: 1017; GFX6: ; %bb.0: 1018; GFX6-NEXT: v_subrev_i32_e64 v0, s[0:1], s0, v0 1019; GFX6-NEXT: s_mov_b32 m0, -1 1020; GFX6-NEXT: ;;#ASMSTART 1021; GFX6-NEXT: ; def vcc 1022; GFX6-NEXT: ;;#ASMEND 1023; GFX6-NEXT: ds_write_b32 v0, v0 1024; GFX6-NEXT: ;;#ASMSTART 1025; GFX6-NEXT: ; use vcc 1026; GFX6-NEXT: ;;#ASMEND 1027; GFX6-NEXT: s_endpgm 1028; 1029; GFX8-LABEL: sub_select_vop3: 1030; GFX8: ; %bb.0: 1031; GFX8-NEXT: v_subrev_u32_e64 v0, s[0:1], s0, v0 1032; GFX8-NEXT: s_mov_b32 m0, -1 1033; GFX8-NEXT: ;;#ASMSTART 1034; GFX8-NEXT: ; def vcc 1035; GFX8-NEXT: ;;#ASMEND 1036; GFX8-NEXT: ds_write_b32 v0, v0 1037; GFX8-NEXT: ;;#ASMSTART 1038; GFX8-NEXT: ; use vcc 1039; GFX8-NEXT: ;;#ASMEND 1040; GFX8-NEXT: s_endpgm 1041; 1042; GFX9-LABEL: sub_select_vop3: 1043; GFX9: ; %bb.0: 1044; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 1045; GFX9-NEXT: ;;#ASMSTART 1046; GFX9-NEXT: ; def vcc 1047; GFX9-NEXT: ;;#ASMEND 1048; GFX9-NEXT: ds_write_b32 v0, v0 1049; GFX9-NEXT: ;;#ASMSTART 1050; GFX9-NEXT: ; use vcc 1051; GFX9-NEXT: ;;#ASMEND 1052; GFX9-NEXT: s_endpgm 1053; 1054; GFX12-LABEL: sub_select_vop3: 1055; GFX12: ; %bb.0: 1056; GFX12-NEXT: v_subrev_nc_u32_e32 v0, s0, v0 1057; GFX12-NEXT: ;;#ASMSTART 1058; GFX12-NEXT: ; 
def vcc 1059; GFX12-NEXT: ;;#ASMEND 1060; GFX12-NEXT: ds_store_b32 v0, v0 1061; GFX12-NEXT: ;;#ASMSTART 1062; GFX12-NEXT: ; use vcc 1063; GFX12-NEXT: ;;#ASMEND 1064; GFX12-NEXT: s_endpgm 1065 %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"() 1066 %sub = sub i32 %v, %s 1067 store i32 %sub, ptr addrspace(3) undef 1068 call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc) 1069 ret void 1070} 1071