; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefixes=SI,GCN,FUNC %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck --check-prefixes=VI,GCN,FUNC %s
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s

define amdgpu_kernel void @local_size_x(ptr addrspace(1) %out) {
; SI-LABEL: local_size_x:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x6
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_x:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_load_dword s2, s[4:5], 0x18
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_x:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    MOV * T1.X, KC0[1].Z,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.r600.read.local.size.x() #0
  store i32 %0, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_y(ptr addrspace(1) %out) {
; SI-LABEL: local_size_y:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x7
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_y:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_load_dword s2, s[4:5], 0x1c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_y:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    MOV * T1.X, KC0[1].W,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.r600.read.local.size.y() #0
  store i32 %0, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_z(ptr addrspace(1) %out) {
; SI-LABEL: local_size_z:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_z:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_load_dword s2, s[4:5], 0x20
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_z:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    MOV * T1.X, KC0[2].X,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %0 = call i32 @llvm.r600.read.local.size.z() #0
  store i32 %0, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
; SI-LABEL: local_size_xy:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x6
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s4, s6, s7
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_xy:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x18
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_xy:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    MULLO_INT * T1.X, KC0[1].Z, KC0[1].W,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %x = call i32 @llvm.r600.read.local.size.x() #0
  %y = call i32 @llvm.r600.read.local.size.y() #0
  %val = mul i32 %x, %y
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
; SI-LABEL: local_size_xz:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s2, s[4:5], 0x6
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s4, s2, s6
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_xz:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s2, s[4:5], 0x18
; VI-NEXT:    s_load_dword s3, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s2, s2, s3
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_xz:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    MULLO_INT * T1.X, KC0[1].Z, KC0[2].X,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %x = call i32 @llvm.r600.read.local.size.x() #0
  %z = call i32 @llvm.r600.read.local.size.z() #0
  %val = mul i32 %x, %z
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) {
; SI-LABEL: local_size_yz:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x7
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s0, s0, s1
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s2
; SI-NEXT:    s_mov_b32 s5, s3
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_yz:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x1c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_yz:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    MULLO_INT * T1.X, KC0[1].W, KC0[2].X,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %y = call i32 @llvm.r600.read.local.size.y() #0
  %z = call i32 @llvm.r600.read.local.size.z() #0
  %val = mul i32 %y, %z
  store i32 %val, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
; SI-LABEL: local_size_xyz:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x6
; SI-NEXT:    s_load_dword s2, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s4, s6, s7
; SI-NEXT:    s_add_i32 s4, s4, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_xyz:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x18
; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s0, s0, s1
; VI-NEXT:    s_add_i32 s0, s0, s6
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_xyz:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    MULLO_INT * T0.X, KC0[1].Z, KC0[1].W,
; R600-NEXT:    ADD_INT T0.X, PS, KC0[2].X,
; R600-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %x = call i32 @llvm.r600.read.local.size.x() #0
  %y = call i32 @llvm.r600.read.local.size.y() #0
  %z = call i32 @llvm.r600.read.local.size.z() #0
  %xy = mul i32 %x, %y
  %xyz = add i32 %xy, %z
  store i32 %xyz, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_x_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_x_known_bits:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x6
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_x_known_bits:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_load_dword s2, s[4:5], 0x18
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_x_known_bits:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    AND_INT * T1.X, KC0[1].Z, literal.y,
; R600-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
entry:
  %size = call i32 @llvm.r600.read.local.size.x() #0
  %shl = shl i32 %size, 16
  %shr = lshr i32 %shl, 16
  store i32 %shr, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_y_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_y_known_bits:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x7
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_y_known_bits:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_load_dword s2, s[4:5], 0x1c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_y_known_bits:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    AND_INT * T1.X, KC0[1].W, literal.y,
; R600-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
entry:
  %size = call i32 @llvm.r600.read.local.size.y() #0
  %shl = shl i32 %size, 16
  %shr = lshr i32 %shl, 16
  store i32 %shr, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @local_size_z_known_bits(ptr addrspace(1) %out) {
; SI-LABEL: local_size_z_known_bits:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: local_size_z_known_bits:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_load_dword s2, s[4:5], 0x20
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; R600-LABEL: local_size_z_known_bits:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:    LSHR T0.X, KC0[2].Y, literal.x,
; R600-NEXT:    AND_INT * T1.X, KC0[2].X, literal.y,
; R600-NEXT:    2(2.802597e-45), 65535(9.183409e-41)
entry:
  %size = call i32 @llvm.r600.read.local.size.z() #0
  %shl = shl i32 %size, 16
  %shr = lshr i32 %shl, 16
  store i32 %shr, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0

attributes #0 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FUNC: {{.*}}
; GCN: {{.*}}