; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s

; Test 3-input or/and/xor chains whose result is inverted: divergent inputs
; select VALU code (e.g. v_or3_b32, v_xnor_b32_e32), while uniform inputs
; fold the inversion into the SALU instructions s_nor, s_nand and s_xnor.

define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v0, v1, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}

define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v1, v3, v1, v5
; GCN-NEXT:    v_or3_b32 v0, v2, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}

define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, v1, v0
; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}

define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_and_b32_e32 v1, v3, v1
; GCN-NEXT:    v_and_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v1, v1, v5
; GCN-NEXT:    v_and_b32_e32 v0, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}

define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, v1, v0
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v2
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i32>, ptr addrspace(1) %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %i2, align 16
  ret void
}

define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, ptr addrspace(1) %arg, i64 %i1
  %i3 = load <3 x i64>, ptr addrspace(1) %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %i2, align 32
  ret void
}

define amdgpu_kernel void @uniform_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b32 s0, s1, s0
; GCN-NEXT:    s_nor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @uniform_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nor_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

define amdgpu_kernel void @uniform_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s0, s1, s0
; GCN-NEXT:    s_nand_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @uniform_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nand_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

define amdgpu_kernel void @uniform_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b32 s0, s1, s0
; GCN-NEXT:    s_xnor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, ptr addrspace(1) %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  store i32 %i9, ptr addrspace(1) %arg, align 16
  ret void
}

define amdgpu_kernel void @uniform_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: uniform_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_xnor_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, ptr addrspace(1) %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  store i64 %i9, ptr addrspace(1) %arg, align 32
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()