; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s

; The check lines in this file are generated; after changing any IR below,
; regenerate them with utils/update_llc_test_checks.py instead of editing them.

; i32 add of two adjacent values loaded from %in, result stored to %out.
; Every target is expected to select a single scalar add (s_add_i32, or
; s_add_co_i32 on gfx1200).
define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: s_add_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_i32 s4, s4, s5
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_add_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_i32 s0, s2, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_add_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s2, s4, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_add_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_i32 s2, s4, s5
; GFX10-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_add_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_add_i32 s2, s2, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_add_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_co_i32 s2, s2, s3
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT:    s_endpgm
  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %a = load i32, ptr addrspace(1) %in
  %b = load i32, ptr addrspace(1) %b_ptr
  %result = add i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; <2 x i32> add of two adjacent vectors loaded from %in. The expected code
; performs one scalar add per element (two adds) on every target.
define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: s_add_v2i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_i32 s5, s5, s7
; GFX6-NEXT:    s_add_i32 s4, s4, s6
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s5
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_add_v2i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_i32 s0, s5, s7
; GFX8-NEXT:    s_add_i32 s1, s4, s6
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_mov_b32_e32 v3, s0
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_add_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s2, s5, s7
; GFX9-NEXT:    s_add_i32 s3, s4, s6
; GFX9-NEXT:    v_mov_b32_e32 v0, s3
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_add_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_i32 s2, s4, s6
; GFX10-NEXT:    s_add_i32 s3, s5, s7
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_add_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_add_i32 s2, s4, s6
; GFX11-NEXT:    s_add_i32 s3, s5, s7
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_add_v2i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_co_i32 s2, s4, s6
; GFX12-NEXT:    s_add_co_i32 s3, s5, s7
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    v_mov_b32_e32 v0, s2
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <2 x i32>, ptr addrspace(1) %in
  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
  %result = add <2 x i32> %a, %b
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

; <4 x i32> add of two adjacent vectors loaded from %in. The expected code
; performs four scalar adds and a single 128-bit store on every target.
define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX6-LABEL: s_add_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX6-NEXT:    s_mov_b32 s11, 0xf000
; GFX6-NEXT:    s_mov_b32 s10, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_i32 s3, s3, s7
; GFX6-NEXT:    s_add_i32 s2, s2, s6
; GFX6-NEXT:    s_add_i32 s1, s1, s5
; GFX6-NEXT:    s_add_i32 s0, s0, s4
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    v_mov_b32_e32 v2, s2
; GFX6-NEXT:    v_mov_b32_e32 v3, s3
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_add_v4i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX8-NEXT:    v_mov_b32_e32 v4, s8
; GFX8-NEXT:    v_mov_b32_e32 v5, s9
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_i32 s3, s3, s7
; GFX8-NEXT:    s_add_i32 s2, s2, s6
; GFX8-NEXT:    s_add_i32 s1, s1, s5
; GFX8-NEXT:    s_add_i32 s0, s0, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_mov_b32_e32 v3, s3
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_add_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s3, s3, s7
; GFX9-NEXT:    s_add_i32 s2, s2, s6
; GFX9-NEXT:    s_add_i32 s1, s1, s5
; GFX9-NEXT:    s_add_i32 s0, s0, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_add_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_i32 s3, s3, s7
; GFX10-NEXT:    s_add_i32 s2, s2, s6
; GFX10-NEXT:    s_add_i32 s0, s0, s4
; GFX10-NEXT:    s_add_i32 s1, s1, s5
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_add_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_add_i32 s3, s3, s7
; GFX11-NEXT:    s_add_i32 s2, s2, s6
; GFX11-NEXT:    s_add_i32 s0, s0, s4
; GFX11-NEXT:    s_add_i32 s1, s1, s5
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT:    v_mov_b32_e32 v2, s2
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_add_v4i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_co_i32 s3, s3, s7
; GFX12-NEXT:    s_add_co_i32 s2, s2, s6
; GFX12-NEXT:    s_add_co_i32 s0, s0, s4
; GFX12-NEXT:    s_add_co_i32 s1, s1, s5
; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT:    v_mov_b32_e32 v2, s2
; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
; GFX12-NEXT:    s_endpgm
  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <4 x i32>, ptr addrspace(1) %in
  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
  %result = add <4 x i32> %a, %b
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; <8 x i32> add whose operands are passed directly as kernel arguments. The
; expected code performs eight scalar adds and writes the result with two
; 128-bit stores (offsets 16 and 0).
define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) {
; GFX6-LABEL: s_add_v8i32:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x11
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_i32 s4, s11, s19
; GFX6-NEXT:    s_add_i32 s5, s10, s18
; GFX6-NEXT:    s_add_i32 s6, s9, s17
; GFX6-NEXT:    s_add_i32 s7, s8, s16
; GFX6-NEXT:    s_add_i32 s8, s15, s23
; GFX6-NEXT:    s_add_i32 s9, s14, s22
; GFX6-NEXT:    s_add_i32 s10, s13, s21
; GFX6-NEXT:    s_add_i32 s11, s12, s20
; GFX6-NEXT:    v_mov_b32_e32 v0, s11
; GFX6-NEXT:    v_mov_b32_e32 v1, s10
; GFX6-NEXT:    v_mov_b32_e32 v2, s9
; GFX6-NEXT:    v_mov_b32_e32 v3, s8
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s7
; GFX6-NEXT:    v_mov_b32_e32 v1, s6
; GFX6-NEXT:    v_mov_b32_e32 v2, s5
; GFX6-NEXT:    v_mov_b32_e32 v3, s4
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_add_v8i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_i32 s4, s11, s19
; GFX8-NEXT:    s_add_i32 s5, s10, s18
; GFX8-NEXT:    s_add_i32 s6, s9, s17
; GFX8-NEXT:    s_add_i32 s7, s8, s16
; GFX8-NEXT:    s_add_i32 s2, s15, s23
; GFX8-NEXT:    s_add_i32 s3, s14, s22
; GFX8-NEXT:    s_add_i32 s8, s13, s21
; GFX8-NEXT:    s_add_i32 s9, s12, s20
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    s_add_u32 s2, s0, 16
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    s_addc_u32 s3, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s9
; GFX8-NEXT:    v_mov_b32_e32 v1, s8
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v0, s7
; GFX8-NEXT:    v_mov_b32_e32 v1, s6
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_add_v8i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s4, s9, s17
; GFX9-NEXT:    s_add_i32 s5, s8, s16
; GFX9-NEXT:    s_add_i32 s6, s15, s23
; GFX9-NEXT:    s_add_i32 s7, s14, s22
; GFX9-NEXT:    s_add_i32 s8, s13, s21
; GFX9-NEXT:    s_add_i32 s9, s12, s20
; GFX9-NEXT:    s_add_i32 s2, s11, s19
; GFX9-NEXT:    s_add_i32 s3, s10, s18
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_add_v8i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_i32 s4, s9, s17
; GFX10-NEXT:    s_add_i32 s5, s8, s16
; GFX10-NEXT:    s_add_i32 s6, s15, s23
; GFX10-NEXT:    s_add_i32 s7, s14, s22
; GFX10-NEXT:    s_add_i32 s8, s12, s20
; GFX10-NEXT:    s_add_i32 s9, s13, s21
; GFX10-NEXT:    s_add_i32 s2, s11, s19
; GFX10-NEXT:    s_add_i32 s3, s10, s18
; GFX10-NEXT:    v_mov_b32_e32 v0, s8
; GFX10-NEXT:    v_mov_b32_e32 v1, s9
; GFX10-NEXT:    v_mov_b32_e32 v2, s7
; GFX10-NEXT:    v_mov_b32_e32 v3, s6
; GFX10-NEXT:    v_mov_b32_e32 v4, s5
; GFX10-NEXT:    v_mov_b32_e32 v5, s4
; GFX10-NEXT:    v_mov_b32_e32 v6, s3
; GFX10-NEXT:    v_mov_b32_e32 v7, s2
; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_add_v8i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_add_i32 s4, s9, s17
; GFX11-NEXT:    s_add_i32 s5, s8, s16
; GFX11-NEXT:    s_add_i32 s6, s15, s23
; GFX11-NEXT:    s_add_i32 s7, s14, s22
; GFX11-NEXT:    s_add_i32 s8, s12, s20
; GFX11-NEXT:    s_add_i32 s9, s13, s21
; GFX11-NEXT:    s_add_i32 s2, s11, s19
; GFX11-NEXT:    s_add_i32 s3, s10, s18
; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4
; GFX11-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2
; GFX11-NEXT:    v_mov_b32_e32 v6, s3
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_add_v8i32:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_co_i32 s4, s9, s17
; GFX12-NEXT:    s_add_co_i32 s5, s8, s16
; GFX12-NEXT:    s_add_co_i32 s6, s15, s23
; GFX12-NEXT:    s_add_co_i32 s7, s14, s22
; GFX12-NEXT:    s_add_co_i32 s8, s12, s20
; GFX12-NEXT:    s_add_co_i32 s9, s13, s21
; GFX12-NEXT:    s_add_co_i32 s2, s11, s19
; GFX12-NEXT:    s_add_co_i32 s3, s10, s18
; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4
; GFX12-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2
; GFX12-NEXT:    v_mov_b32_e32 v6, s3
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %0 = add <8 x i32> %a, %b
  store <8 x i32> %0, ptr addrspace(1) %out
  ret void
}

; <16 x i32> add whose operands are passed directly as kernel arguments. The
; expected code performs sixteen scalar adds and writes the result with four
; 128-bit stores (offsets 48, 32, 16 and 0).
define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) {
; GFX6-LABEL: s_add_v16i32:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
; GFX6-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x29
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_i32 s6, s11, s39
; GFX6-NEXT:    s_add_i32 s7, s10, s38
; GFX6-NEXT:    s_add_i32 s10, s15, s43
; GFX6-NEXT:    s_add_i32 s11, s14, s42
; GFX6-NEXT:    s_add_i32 s14, s19, s47
; GFX6-NEXT:    s_add_i32 s15, s18, s46
; GFX6-NEXT:    s_add_i32 s18, s23, s51
; GFX6-NEXT:    s_add_i32 s19, s22, s50
; GFX6-NEXT:    s_add_i32 s21, s21, s49
; GFX6-NEXT:    s_add_i32 s20, s20, s48
; GFX6-NEXT:    s_add_i32 s17, s17, s45
; GFX6-NEXT:    s_add_i32 s16, s16, s44
; GFX6-NEXT:    v_mov_b32_e32 v0, s20
; GFX6-NEXT:    v_mov_b32_e32 v1, s21
; GFX6-NEXT:    v_mov_b32_e32 v2, s19
; GFX6-NEXT:    v_mov_b32_e32 v3, s18
; GFX6-NEXT:    s_add_i32 s13, s13, s41
; GFX6-NEXT:    s_add_i32 s12, s12, s40
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s16
; GFX6-NEXT:    v_mov_b32_e32 v1, s17
; GFX6-NEXT:    v_mov_b32_e32 v2, s15
; GFX6-NEXT:    v_mov_b32_e32 v3, s14
; GFX6-NEXT:    s_add_i32 s9, s9, s37
; GFX6-NEXT:    s_add_i32 s8, s8, s36
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s12
; GFX6-NEXT:    v_mov_b32_e32 v1, s13
; GFX6-NEXT:    v_mov_b32_e32 v2, s11
; GFX6-NEXT:    v_mov_b32_e32 v3, s10
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GFX6-NEXT:    s_waitcnt expcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s8
; GFX6-NEXT:    v_mov_b32_e32 v1, s9
; GFX6-NEXT:    v_mov_b32_e32 v2, s7
; GFX6-NEXT:    v_mov_b32_e32 v3, s6
; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: s_add_v16i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GFX8-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_i32 s4, s11, s39
; GFX8-NEXT:    s_add_i32 s5, s10, s38
; GFX8-NEXT:    s_add_i32 s6, s9, s37
; GFX8-NEXT:    s_add_i32 s7, s8, s36
; GFX8-NEXT:    s_add_i32 s8, s15, s43
; GFX8-NEXT:    s_add_i32 s9, s14, s42
; GFX8-NEXT:    s_add_i32 s10, s13, s41
; GFX8-NEXT:    s_add_i32 s11, s12, s40
; GFX8-NEXT:    s_add_i32 s12, s19, s47
; GFX8-NEXT:    s_add_i32 s13, s18, s46
; GFX8-NEXT:    s_add_i32 s14, s17, s45
; GFX8-NEXT:    s_add_i32 s15, s16, s44
; GFX8-NEXT:    s_add_i32 s2, s23, s51
; GFX8-NEXT:    s_add_i32 s3, s22, s50
; GFX8-NEXT:    s_add_i32 s16, s21, s49
; GFX8-NEXT:    s_add_i32 s17, s20, s48
; GFX8-NEXT:    v_mov_b32_e32 v3, s2
; GFX8-NEXT:    s_add_u32 s2, s0, 48
; GFX8-NEXT:    v_mov_b32_e32 v2, s3
; GFX8-NEXT:    s_addc_u32 s3, s1, 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    s_add_u32 s2, s0, 32
; GFX8-NEXT:    v_mov_b32_e32 v0, s17
; GFX8-NEXT:    v_mov_b32_e32 v1, s16
; GFX8-NEXT:    s_addc_u32 s3, s1, 0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    s_add_u32 s2, s0, 16
; GFX8-NEXT:    v_mov_b32_e32 v0, s15
; GFX8-NEXT:    v_mov_b32_e32 v1, s14
; GFX8-NEXT:    v_mov_b32_e32 v2, s13
; GFX8-NEXT:    v_mov_b32_e32 v3, s12
; GFX8-NEXT:    s_addc_u32 s3, s1, 0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    v_mov_b32_e32 v5, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s11
; GFX8-NEXT:    v_mov_b32_e32 v1, s10
; GFX8-NEXT:    v_mov_b32_e32 v2, s9
; GFX8-NEXT:    v_mov_b32_e32 v3, s8
; GFX8-NEXT:    v_mov_b32_e32 v4, s2
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_mov_b32_e32 v0, s7
; GFX8-NEXT:    v_mov_b32_e32 v1, s6
; GFX8-NEXT:    v_mov_b32_e32 v2, s5
; GFX8-NEXT:    v_mov_b32_e32 v3, s4
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: s_add_v16i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s4, s9, s37
; GFX9-NEXT:    s_add_i32 s5, s8, s36
; GFX9-NEXT:    s_add_i32 s6, s15, s43
; GFX9-NEXT:    s_add_i32 s7, s14, s42
; GFX9-NEXT:    s_add_i32 s8, s13, s41
; GFX9-NEXT:    s_add_i32 s9, s12, s40
; GFX9-NEXT:    s_add_i32 s12, s17, s45
; GFX9-NEXT:    s_add_i32 s13, s16, s44
; GFX9-NEXT:    s_add_i32 s14, s23, s51
; GFX9-NEXT:    s_add_i32 s15, s22, s50
; GFX9-NEXT:    s_add_i32 s16, s21, s49
; GFX9-NEXT:    s_add_i32 s17, s20, s48
; GFX9-NEXT:    s_add_i32 s2, s11, s39
; GFX9-NEXT:    s_add_i32 s3, s10, s38
; GFX9-NEXT:    s_add_i32 s10, s19, s47
; GFX9-NEXT:    s_add_i32 s11, s18, s46
; GFX9-NEXT:    v_mov_b32_e32 v0, s17
; GFX9-NEXT:    v_mov_b32_e32 v1, s16
; GFX9-NEXT:    v_mov_b32_e32 v2, s15
; GFX9-NEXT:    v_mov_b32_e32 v3, s14
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s13
; GFX9-NEXT:    v_mov_b32_e32 v1, s12
; GFX9-NEXT:    v_mov_b32_e32 v2, s11
; GFX9-NEXT:    v_mov_b32_e32 v3, s10
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s9
; GFX9-NEXT:    v_mov_b32_e32 v1, s8
; GFX9-NEXT:    v_mov_b32_e32 v2, s7
; GFX9-NEXT:    v_mov_b32_e32 v3, s6
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    v_mov_b32_e32 v0, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, s4
; GFX9-NEXT:    v_mov_b32_e32 v2, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: s_add_v16i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v16, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_i32 s4, s9, s37
; GFX10-NEXT:    s_add_i32 s5, s8, s36
; GFX10-NEXT:    s_add_i32 s6, s15, s43
; GFX10-NEXT:    s_add_i32 s7, s14, s42
; GFX10-NEXT:    s_add_i32 s8, s13, s41
; GFX10-NEXT:    s_add_i32 s9, s12, s40
; GFX10-NEXT:    s_add_i32 s12, s17, s45
; GFX10-NEXT:    s_add_i32 s13, s16, s44
; GFX10-NEXT:    s_add_i32 s14, s23, s51
; GFX10-NEXT:    s_add_i32 s15, s22, s50
; GFX10-NEXT:    s_add_i32 s16, s20, s48
; GFX10-NEXT:    s_add_i32 s17, s21, s49
; GFX10-NEXT:    s_add_i32 s2, s11, s39
; GFX10-NEXT:    s_add_i32 s3, s10, s38
; GFX10-NEXT:    s_add_i32 s10, s19, s47
; GFX10-NEXT:    s_add_i32 s11, s18, s46
; GFX10-NEXT:    v_mov_b32_e32 v0, s16
; GFX10-NEXT:    v_mov_b32_e32 v1, s17
; GFX10-NEXT:    v_mov_b32_e32 v2, s15
; GFX10-NEXT:    v_mov_b32_e32 v3, s14
; GFX10-NEXT:    v_mov_b32_e32 v4, s13
; GFX10-NEXT:    v_mov_b32_e32 v5, s12
; GFX10-NEXT:    v_mov_b32_e32 v6, s11
; GFX10-NEXT:    v_mov_b32_e32 v7, s10
; GFX10-NEXT:    v_mov_b32_e32 v8, s9
; GFX10-NEXT:    v_mov_b32_e32 v9, s8
; GFX10-NEXT:    v_mov_b32_e32 v10, s7
; GFX10-NEXT:    v_mov_b32_e32 v11, s6
; GFX10-NEXT:    v_mov_b32_e32 v12, s5
; GFX10-NEXT:    v_mov_b32_e32 v13, s4
; GFX10-NEXT:    v_mov_b32_e32 v14, s3
; GFX10-NEXT:    v_mov_b32_e32 v15, s2
; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
; GFX10-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
; GFX10-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
; GFX10-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_add_v16i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x64
; GFX11-NEXT:    s_load_b512 s[36:51], s[4:5], 0xa4
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_add_i32 s4, s9, s37
; GFX11-NEXT:    s_add_i32 s5, s8, s36
; GFX11-NEXT:    s_add_i32 s6, s15, s43
; GFX11-NEXT:    s_add_i32 s7, s14, s42
; GFX11-NEXT:    s_add_i32 s8, s13, s41
; GFX11-NEXT:    s_add_i32 s9, s12, s40
; GFX11-NEXT:    s_add_i32 s12, s17, s45
; GFX11-NEXT:    s_add_i32 s13, s16, s44
; GFX11-NEXT:    s_add_i32 s14, s23, s51
; GFX11-NEXT:    s_add_i32 s15, s22, s50
; GFX11-NEXT:    s_add_i32 s16, s20, s48
; GFX11-NEXT:    s_add_i32 s17, s21, s49
; GFX11-NEXT:    s_add_i32 s2, s11, s39
; GFX11-NEXT:    s_add_i32 s3, s10, s38
; GFX11-NEXT:    s_add_i32 s10, s19, s47
; GFX11-NEXT:    s_add_i32 s11, s18, s46
; GFX11-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14
; GFX11-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12
; GFX11-NEXT:    v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10
; GFX11-NEXT:    v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
; GFX11-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
; GFX11-NEXT:    v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4
; GFX11-NEXT:    v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2
; GFX11-NEXT:    v_mov_b32_e32 v14, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: s_add_v16i32:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x2
; GFX12-NEXT:    s_load_b512 s[8:23], s[4:5], 0x64
; GFX12-NEXT:    s_load_b512 s[36:51], s[4:5], 0xa4
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_co_i32 s4, s9, s37
; GFX12-NEXT:    s_add_co_i32 s5, s8, s36
; GFX12-NEXT:    s_add_co_i32 s6, s15, s43
; GFX12-NEXT:    s_add_co_i32 s7, s14, s42
; GFX12-NEXT:    s_add_co_i32 s8, s13, s41
; GFX12-NEXT:    s_add_co_i32 s9, s12, s40
; GFX12-NEXT:    s_add_co_i32 s12, s17, s45
; GFX12-NEXT:    s_add_co_i32 s13, s16, s44
; GFX12-NEXT:    s_add_co_i32 s14, s23, s51
; GFX12-NEXT:    s_add_co_i32 s15, s22, s50
; GFX12-NEXT:    s_add_co_i32 s16, s20, s48
; GFX12-NEXT:    s_add_co_i32 s17, s21, s49
; GFX12-NEXT:    s_add_co_i32 s2, s11, s39
; GFX12-NEXT:    s_add_co_i32 s3, s10, s38
; GFX12-NEXT:    s_add_co_i32 s10, s19, s47
; GFX12-NEXT:    s_add_co_i32 s11, s18, s46
; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
; GFX12-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14
; GFX12-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12
; GFX12-NEXT:    v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10
; GFX12-NEXT:    v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
; GFX12-NEXT:    v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4
; GFX12-NEXT:    v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2
; GFX12-NEXT:    v_mov_b32_e32 v14, s3
; GFX12-NEXT:    s_clause 0x3
; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %0 = add <16 x i32> %a, %b
  store <16 x i32> %0, ptr addrspace(1) %out
  ret void
}

; i32 add of two adjacent values addressed by the workitem id
; (%in[tid] and %in[tid + 1]). Both loads are volatile, so each is expected
; to be followed by its own wait. The add is expected on the vector ALU
; (v_add_i32 / v_add_u32 / v_add_nc_u32 depending on target).
define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_add_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s10, 0
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_add_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v4, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_dword v2, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_add_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_add_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_add_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_add_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_add_nc_u32_e32 v0, v1, v0
; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
  %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
  %a = load volatile i32, ptr addrspace(1) %gep
  %b = load volatile i32, ptr addrspace(1) %b_ptr
  %result = add i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: v_add_imm_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s10, 0
; GFX6-NEXT:    s_mov_b32 s11, s7
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX6-NEXT:    v_mov_b32_e32 v1, 0
; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc
; GFX6-NEXT:    s_waitcnt vmcnt(0)
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0x7b, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: v_add_imm_i32:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v2, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_add_imm_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0x7b, v0
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_add_imm_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_add_imm_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_add_imm_i32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
  %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
  %a = load volatile i32, ptr addrspace(1) %gep
  %result = add i32 %a, 123
  store i32 %result, ptr
addrspace(1) %out 955 ret void 956} 957 958define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) { 959; GFX6-LABEL: add64: 960; GFX6: ; %bb.0: ; %entry 961; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 962; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 963; GFX6-NEXT: s_mov_b32 s7, 0xf000 964; GFX6-NEXT: s_mov_b32 s6, -1 965; GFX6-NEXT: s_waitcnt lgkmcnt(0) 966; GFX6-NEXT: s_mov_b32 s4, s0 967; GFX6-NEXT: s_add_u32 s0, s2, s8 968; GFX6-NEXT: s_mov_b32 s5, s1 969; GFX6-NEXT: s_addc_u32 s1, s3, s9 970; GFX6-NEXT: v_mov_b32_e32 v0, s0 971; GFX6-NEXT: v_mov_b32_e32 v1, s1 972; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 973; GFX6-NEXT: s_endpgm 974; 975; GFX8-LABEL: add64: 976; GFX8: ; %bb.0: ; %entry 977; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 978; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 979; GFX8-NEXT: s_waitcnt lgkmcnt(0) 980; GFX8-NEXT: v_mov_b32_e32 v0, s0 981; GFX8-NEXT: s_add_u32 s0, s2, s4 982; GFX8-NEXT: v_mov_b32_e32 v1, s1 983; GFX8-NEXT: s_addc_u32 s1, s3, s5 984; GFX8-NEXT: v_mov_b32_e32 v3, s1 985; GFX8-NEXT: v_mov_b32_e32 v2, s0 986; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 987; GFX8-NEXT: s_endpgm 988; 989; GFX9-LABEL: add64: 990; GFX9: ; %bb.0: ; %entry 991; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 992; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 993; GFX9-NEXT: v_mov_b32_e32 v2, 0 994; GFX9-NEXT: s_waitcnt lgkmcnt(0) 995; GFX9-NEXT: s_add_u32 s2, s2, s6 996; GFX9-NEXT: s_addc_u32 s3, s3, s7 997; GFX9-NEXT: v_mov_b32_e32 v0, s2 998; GFX9-NEXT: v_mov_b32_e32 v1, s3 999; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1000; GFX9-NEXT: s_endpgm 1001; 1002; GFX10-LABEL: add64: 1003; GFX10: ; %bb.0: ; %entry 1004; GFX10-NEXT: s_clause 0x1 1005; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1006; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1007; GFX10-NEXT: v_mov_b32_e32 v2, 0 1008; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1009; GFX10-NEXT: s_add_u32 s2, s2, s6 1010; GFX10-NEXT: s_addc_u32 s3, s3, s7 
1011; GFX10-NEXT: v_mov_b32_e32 v0, s2 1012; GFX10-NEXT: v_mov_b32_e32 v1, s3 1013; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 1014; GFX10-NEXT: s_endpgm 1015; 1016; GFX11-LABEL: add64: 1017; GFX11: ; %bb.0: ; %entry 1018; GFX11-NEXT: s_clause 0x1 1019; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1020; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1021; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1022; GFX11-NEXT: s_add_u32 s2, s2, s4 1023; GFX11-NEXT: s_addc_u32 s3, s3, s5 1024; GFX11-NEXT: v_mov_b32_e32 v0, s2 1025; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 1026; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1027; GFX11-NEXT: s_endpgm 1028; 1029; GFX12-LABEL: add64: 1030; GFX12: ; %bb.0: ; %entry 1031; GFX12-NEXT: s_clause 0x1 1032; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1033; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1034; GFX12-NEXT: s_wait_kmcnt 0x0 1035; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] 1036; GFX12-NEXT: v_mov_b32_e32 v2, 0 1037; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 1038; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] 1039; GFX12-NEXT: s_endpgm 1040entry: 1041 %add = add i64 %a, %b 1042 store i64 %add, ptr addrspace(1) %out 1043 ret void 1044} 1045 1046; The v_addc_u32 and v_add_i32 instruction can't read SGPRs, because they 1047; use VCC. The test is designed so that %a will be stored in an SGPR and 1048; %0 will be stored in a VGPR, so the compiler will be forced to copy %a 1049; to a VGPR before doing the add. 
; NOTE(review): in the autogenerated checks below, every target loads %in with
; a scalar load and performs the 64-bit add with scalar instructions
; (s_add_u32/s_addc_u32, or s_add_nc_u64 on GFX12); the sum is moved into VGPRs
; only for the final store. Confirm whether the description preceding this
; function (copying %a to a VGPR before the add) still matches the intended
; coverage.
define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) {
; GFX6-LABEL: add64_sgpr_vgpr:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
; GFX6-NEXT:    s_mov_b32 s4, s0
; GFX6-NEXT:    s_mov_b32 s5, s1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_u32 s0, s2, s8
; GFX6-NEXT:    s_addc_u32 s1, s3, s9
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    v_mov_b32_e32 v1, s1
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add64_sgpr_vgpr:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s0, s2, s4
; GFX8-NEXT:    s_addc_u32 s1, s3, s5
; GFX8-NEXT:    v_mov_b32_e32 v3, s1
; GFX8-NEXT:    v_mov_b32_e32 v2, s0
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add64_sgpr_vgpr:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_u32 s2, s2, s4
; GFX9-NEXT:    s_addc_u32 s3, s3, s5
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add64_sgpr_vgpr:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s2, s2, s4
; GFX10-NEXT:    s_addc_u32 s3, s3, s5
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add64_sgpr_vgpr:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_add_u32 s2, s2, s4
; GFX11-NEXT:    s_addc_u32 s3, s3, s5
; GFX11-NEXT:    v_mov_b32_e32 v0, s2
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: add64_sgpr_vgpr:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_clause 0x1
; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
; GFX12-NEXT:    v_mov_b32_e32 v2, 0
; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
entry:
  %0 = load i64, ptr addrspace(1) %in
  %1 = add i64 %a, %0
  store i64 %1, ptr addrspace(1) %out
  ret void
}

; Test i64 add inside a branch.
; The i64 add is only executed in the %else block (taken when %a != 0); the %if
; block loads the value from %in instead, and the two results are merged by a
; phi in %endif before being stored to %out. The %c argument is not referenced
; by the body.
define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
; GFX6-LABEL: add64_in_branch:
; GFX6:       ; %bb.0: ; %entry
; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX6-NEXT:    s_mov_b64 s[8:9], 0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
; GFX6-NEXT:    s_and_b64 vcc, exec, s[10:11]
; GFX6-NEXT:    s_cbranch_vccz .LBB9_4
; GFX6-NEXT:  ; %bb.1: ; %else
; GFX6-NEXT:    s_add_u32 s4, s4, s6
; GFX6-NEXT:    s_addc_u32 s5, s5, s7
; GFX6-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
; GFX6-NEXT:    s_cbranch_vccnz .LBB9_3
; GFX6-NEXT:  .LBB9_2: ; %if
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX6-NEXT:  .LBB9_3: ; %endif
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mov_b32_e32 v1, s5
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
; GFX6-NEXT:  .LBB9_4:
; GFX6-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX6-NEXT:    s_branch .LBB9_2
;
; GFX8-LABEL: add64_in_branch:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT:    s_mov_b64 s[8:9], 0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX8-NEXT:    s_cbranch_scc0 .LBB9_4
; GFX8-NEXT:  ; %bb.1: ; %else
; GFX8-NEXT:    s_add_u32 s4, s4, s6
; GFX8-NEXT:    s_addc_u32 s5, s5, s7
; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
; GFX8-NEXT:    s_cbranch_vccnz .LBB9_3
; GFX8-NEXT:  .LBB9_2: ; %if
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX8-NEXT:  .LBB9_3: ; %endif
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT:    s_endpgm
; GFX8-NEXT:  .LBB9_4:
; GFX8-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX8-NEXT:    s_branch .LBB9_2
;
; GFX9-LABEL: add64_in_branch:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b64 s[2:3], 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT:    s_cbranch_scc0 .LBB9_4
; GFX9-NEXT:  ; %bb.1: ; %else
; GFX9-NEXT:    s_add_u32 s0, s12, s14
; GFX9-NEXT:    s_addc_u32 s1, s13, s15
; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT:    s_cbranch_vccnz .LBB9_3
; GFX9-NEXT:  .LBB9_2: ; %if
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
; GFX9-NEXT:  .LBB9_3: ; %endif
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB9_4:
; GFX9-NEXT:    ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT:    s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u64 s[12:13], 0
; GFX10-NEXT:    s_cbranch_scc0 .LBB9_4
; GFX10-NEXT:  ; %bb.1: ; %else
; GFX10-NEXT:    s_add_u32 s0, s12, s14
; GFX10-NEXT:    s_addc_u32 s1, s13, s15
; GFX10-NEXT:    s_cbranch_execnz .LBB9_3
; GFX10-NEXT:  .LBB9_2: ; %if
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
; GFX10-NEXT:  .LBB9_3: ; %endif
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT:    s_endpgm
; GFX10-NEXT:  .LBB9_4:
; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT:    s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX11-NEXT:    s_cbranch_scc0 .LBB9_4
; GFX11-NEXT:  ; %bb.1: ; %else
; GFX11-NEXT:    s_add_u32 s4, s4, s6
; GFX11-NEXT:    s_addc_u32 s5, s5, s7
; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
; GFX11-NEXT:  .LBB9_2: ; %if
; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
; GFX11-NEXT:  .LBB9_3: ; %endif
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, s4
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
; GFX11-NEXT:  .LBB9_4:
; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX11-NEXT:    s_branch .LBB9_2
;
; GFX12-LABEL: add64_in_branch:
; GFX12:       ; %bb.0: ; %entry
; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
; GFX12-NEXT:    s_cbranch_scc0 .LBB9_4
; GFX12-NEXT:  ; %bb.1: ; %else
; GFX12-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[6:7]
; GFX12-NEXT:    s_cbranch_execnz .LBB9_3
; GFX12-NEXT:  .LBB9_2: ; %if
; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
; GFX12-NEXT:  .LBB9_3: ; %endif
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_mov_b32_e32 v0, s4
; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
; GFX12-NEXT:  .LBB9_4:
; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
; GFX12-NEXT:    s_branch .LBB9_2
entry:
  %0 = icmp eq i64 %a, 0
  br i1 %0, label %if, label %else

if:
  %1 = load i64, ptr addrspace(1) %in
  br label %endif

else:
  %2 = add i64 %a, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}

; Make sure the VOP3 form of add is initially selected. Otherwise a pair
; of copies from/to VCC would be necessary.
;
; The two inline asm statements define VCC before the add and consume it after
; the store, so VCC is live across the add. The store consumes the add result;
; its LDS address is poison because only the add selection is being checked.
define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
; GFX6-LABEL: add_select_vop3:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ;;#ASMSTART
; GFX6-NEXT:    ; def vcc
; GFX6-NEXT:    ;;#ASMEND
; GFX6-NEXT:    ds_write_b32 v0, v0
; GFX6-NEXT:    ;;#ASMSTART
; GFX6-NEXT:    ; use vcc
; GFX6-NEXT:    ;;#ASMEND
; GFX6-NEXT:    s_endpgm
;
; GFX8-LABEL: add_select_vop3:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
; GFX8-NEXT:    s_mov_b32 m0, -1
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; def vcc
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    ds_write_b32 v0, v0
; GFX8-NEXT:    ;;#ASMSTART
; GFX8-NEXT:    ; use vcc
; GFX8-NEXT:    ;;#ASMEND
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_select_vop3:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; def vcc
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    ds_write_b32 v0, v0
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use vcc
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_select_vop3:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ; def vcc
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    ds_write_b32 v0, v0
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ; use vcc
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_select_vop3:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; def vcc
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    ds_store_b32 v0, v0
; GFX11-NEXT:    ;;#ASMSTART
; GFX11-NEXT:    ; use vcc
; GFX11-NEXT:    ;;#ASMEND
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: add_select_vop3:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX12-NEXT:    ;;#ASMSTART
; GFX12-NEXT:    ; def vcc
; GFX12-NEXT:    ;;#ASMEND
; GFX12-NEXT:    ds_store_b32 v0, v0
; GFX12-NEXT:    ;;#ASMSTART
; GFX12-NEXT:    ; use vcc
; GFX12-NEXT:    ;;#ASMEND
; GFX12-NEXT:    s_endpgm
  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
  %add = add i32 %v, %s
  store i32 %add, ptr addrspace(3) poison
  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }