; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -early-live-intervals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Codegen tests for llvm.amdgcn.set.inactive.* wrapped in llvm.amdgcn.strict.wwm.*:
; each active lane keeps its input value while inactive lanes are set to the
; given immediate, materialized via s_or_saveexec/v_cndmask under full exec.

define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
  store i32 %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 1
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
  store i32 %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
  store i64 %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison_64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 1
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
  %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
  store i64 %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x i32> inreg %desc) {
; GCN-LABEL: set_inactive_scc:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_buffer_load_dword s7, s[0:3], 0x0
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 42, v1, s[2:3]
; GCN-NEXT:    s_mov_b64 exec, s[2:3]
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_lg_u32 s7, 56
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    s_mov_b64 s[2:3], -1
; GCN-NEXT:    s_cbranch_scc1 .LBB4_3
; GCN-NEXT:  ; %bb.1: ; %Flow
; GCN-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; GCN-NEXT:    s_cbranch_vccz .LBB4_4
; GCN-NEXT:  .LBB4_2: ; %.exit
; GCN-NEXT:    s_endpgm
; GCN-NEXT:  .LBB4_3: ; %.one
; GCN-NEXT:    v_add_u32_e32 v2, vcc, 1, v1
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0
; GCN-NEXT:    s_cbranch_execnz .LBB4_2
; GCN-NEXT:  .LBB4_4: ; %.zero
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
  %cmp = icmp eq i32 %val, 56
  %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
  %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
  br i1 %cmp, label %.zero, label %.one

.zero:
  store i32 %tmp, ptr addrspace(1) %out
  br label %.exit

.one:
  %tmp.1 = add i32 %tmp, 1
  store i32 %tmp.1, ptr addrspace(1) %out
  br label %.exit

.exit:
  ret void
}

define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x40400000
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
  %tmp = call float @llvm.amdgcn.strict.wwm.f32(float %tmp.0)
  store float %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x4010cccc
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0xcccccccd
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
  %tmp = call double @llvm.amdgcn.strict.wwm.f64(double %tmp.0)
  store double %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
  %tmp = call <2 x i16> @llvm.amdgcn.strict.wwm.v2i16(<2 x i16> %tmp.0)
  store <2 x i16> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
  ; Use the correctly mangled .v2f16 overload; the previous .v2i16 suffix
  ; mismatched the <2 x half> operand type and relied on auto-remangling.
  %tmp = call <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half> %tmp.0)
  store <2 x half> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
; GCN-LABEL: set_inactive_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 1, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 1, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
  %tmp = call <2 x i32> @llvm.amdgcn.strict.wwm.v2i32(<2 x i32> %tmp.0)
  store <2 x i32> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; GCN-LABEL: set_inactive_v2f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
  %tmp = call <2 x float> @llvm.amdgcn.strict.wwm.v2f32(<2 x float> %tmp.0)
  store <2 x float> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
  %tmp = call <2 x bfloat> @llvm.amdgcn.strict.wwm.v2bf16(<2 x bfloat> %tmp.0)
  store <2 x bfloat> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
; GCN-LABEL: set_inactive_v4i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x10001
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
  %tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
  store <4 x i16> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; GCN-LABEL: set_inactive_v4f16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
  %tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
  store <4 x half> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
; GCN-LABEL: set_inactive_v4bf16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
  %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
  store <4 x bfloat> %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_mov_b32 s4, s0
; GCN-NEXT:    s_mov_b32 s5, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s3
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    s_or_saveexec_b64 s[0:1], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v2, s[0:1]
; GCN-NEXT:    s_mov_b64 exec, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v2, v0
; GCN-NEXT:    v_mov_b32_e32 v3, v1
; GCN-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
  %tmp = call ptr @llvm.amdgcn.strict.wwm.p0(ptr %tmp.0)
  store ptr %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
  %tmp = call ptr addrspace(2) @llvm.amdgcn.strict.wwm.p2(ptr addrspace(2) %tmp.0)
  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
  %tmp = call ptr addrspace(3) @llvm.amdgcn.strict.wwm.p3(ptr addrspace(3) %tmp.0)
  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
  %tmp = call ptr addrspace(5) @llvm.amdgcn.strict.wwm.p5(ptr addrspace(5) %tmp.0)
  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s6, s[4:5], 0x2c
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT:    s_mov_b64 exec, s[4:5]
; GCN-NEXT:    v_mov_b32_e32 v1, v0
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %tmp.0 = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
  %tmp = call ptr addrspace(6) @llvm.amdgcn.strict.wwm.p6(ptr addrspace(6) %tmp.0)
  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
declare i64 @llvm.amdgcn.strict.wwm.i64(i64) #1
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)

attributes #0 = { convergent readnone }
attributes #1 = { convergent nounwind readnone speculatable willreturn }