; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s

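; Editorial note: the ubfe tests below exercise the unsigned bitfield-extract
; pattern: shifting left by (32 - width) and then logically shifting right by
; the same amount keeps the low 'width' bits of the source and zeroes the
; rest (poison when width is 0, since the i32 shift amount would be 32). The
; *_multi_use_shl variants also store the intermediate shl value, so the
; shift pair cannot simply be replaced by a single extract. Note that both
; volatile loads in the v_* tests read %in0.gep, leaving %in1.gep unused.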
define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ubfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_ubfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_lshr_b32 s2, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ubfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_lshr_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_ubfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_lshr_b32 s3, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_lshr_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = lshr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

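; Editorial note: the sbfe tests are the signed form of the same pattern:
; shl by (32 - width) followed by ashr by the same amount sign-extends the
; low 'width' bits of the source, i.e. a signed bitfield extract at offset 0
; (equivalent to sign_extend_inreg when the width is a constant).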
define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_sbfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sbfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; SI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_ashrrev_i32_e32 v3, v3, v2
; SI-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %in0.gep = getelementptr i32, ptr addrspace(1) %in0, i32 %id.x
  %in1.gep = getelementptr i32, ptr addrspace(1) %in1, i32 %id.x
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %src = load volatile i32, ptr addrspace(1) %in0.gep
  %width = load volatile i32, ptr addrspace(1) %in0.gep
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_sbfe_sub_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_ashr_i32 s2, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_sub_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_ashr_i32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  ret void
}

define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i32 %src, i32 %width) #1 {
; SI-LABEL: s_sbfe_sub_multi_use_shl_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_i32 s3, 32, s3
; SI-NEXT: s_lshl_b32 s2, s2, s3
; SI-NEXT: s_ashr_i32 s3, s2, s3
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_sub_i32 s0, 32, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b32 s1, s2, s0
; VI-NEXT: s_ashr_i32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: flat_store_dword v[0:1], v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
  %sub = sub i32 32, %width
  %shl = shl i32 %src, %sub
  %bfe = ashr i32 %shl, %sub
  store i32 %bfe, ptr addrspace(1) %out.gep
  store volatile i32 %shl, ptr addrspace(1) undef
  ret void
}

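; Editorial note: here both shl amounts match the final ashr amount, so the
; expression reduces to a sign-extend of the low 32 - 17 = 15 bits of the
; or of the two loaded values; the checks show it folding to s_bfe_i32 with
; operand 0xf0000 (width 15, offset 0).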
define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; SI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_or_b32 s2, s2, s4
; SI-NEXT: s_bfe_i32 s4, s2, 0xf0000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s0, s2, s3
; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %a0 = load i32, ptr addrspace(1) %in0
  %b0 = load i32, ptr addrspace(1) %in1
  %a1 = shl i32 %a0, 17
  %b1 = shl i32 %b0, 17
  %or = or i32 %a1, %b1
  %result = ashr i32 %or, 17
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; TODO ashr(or(shl(x,c1),shl(y,c2)),c1) -> sign_extend_inreg(or(x,shl(y,c2-c1))) iff c2 >= c1
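; Editorial note: for the values below (c1 = 17, c2 = 19) the fold above
; would give sign_extend_inreg(or(x, shl(y, 2)), i15); it is not implemented
; yet, so the shl/or/ashr sequence survives in the checks.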
define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
; SI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s2, s2, 17
; SI-NEXT: s_lshl_b32 s4, s4, 19
; SI-NEXT: s_or_b32 s2, s2, s4
; SI-NEXT: s_ashr_i32 s4, s2, 17
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s0, s2, 17
; VI-NEXT: s_lshl_b32 s1, s3, 19
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %a0 = load i32, ptr addrspace(1) %x
  %b0 = load i32, ptr addrspace(1) %y
  %a1 = shl i32 %a0, 17
  %b1 = shl i32 %b0, 19
  %or = or i32 %a1, %b1
  %result = ashr i32 %or, 17
  store i32 %result, ptr addrspace(1) %out
  ret void
}

; Don't fold as 'other shl' amount is less than the sign_extend_inreg type.
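; Editorial note: here c2 = 16 < c1 = 17, so the rewrite above would require
; shl(y, -1); the shl/or/ashr sequence has to be kept.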
define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out, ptr addrspace(1) %x, ptr addrspace(1) %y) {
; SI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
; SI-NEXT: s_load_dword s4, s[4:5], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s2, s2, 17
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_or_b32 s2, s2, s4
; SI-NEXT: s_ashr_i32 s4, s2, 17
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s0, s2, 17
; VI-NEXT: s_lshl_b32 s1, s3, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %a0 = load i32, ptr addrspace(1) %x
  %b0 = load i32, ptr addrspace(1) %y
  %a1 = shl i32 %a0, 17
  %b1 = shl i32 %b0, 16
  %or = or i32 %a1, %b1
  %result = ashr i32 %or, 17
  store i32 %result, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }