; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Codegen checks for the llvm.amdgcn.init.exec and
; llvm.amdgcn.init.exec.from.input intrinsics in amdgpu_ps functions.
; Every invocation above shares one check prefix, so the expected output must
; be identical with and without GlobalISel and across gfx900, gfx1010 and
; gfx1100 (the latter two with +wavefrontsize64).

; Constant all-ones mask. The call follows the fadd in the IR, but the exec
; write is expected ahead of the VALU add in the output.
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; Arbitrary constant mask: 123456 is printed as 0x1e240.
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; Mask derived from an input SGPR. %count is the fourth inreg argument
; (expected in s3) and the bit offset passed to the intrinsic is 8; the
; s_bfe_u32 immediate 0x70008 carries width 7 in bits [22:16] and offset 8 in
; the low bits. A count of 64 is handled by the compare + conditional move
; to -1.
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; Same pattern with %count as the first inreg argument (s0) and bit offset 19
; (0x13 in the s_bfe_u32 immediate). s0 is overwritten in place because
; %count has no later use.
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; %count is also an operand of the integer add, so the extracted field must
; land in a different SGPR (s1) and s0 must still hold %count at the add.
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

; Variant of reuse_input with the add/sitofp placed before the intrinsic call
; in the IR; the expected instruction sequence is the same.
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

; GCN-LABEL: {{^}}init_unreachable:
;
; This used to crash.
; Compile-only case: the body is just an init.exec call with an all-ones mask
; followed by unreachable, and no instructions are checked.
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

; Two addrspace(5) allocas with dynamically indexed loads. The exec write is
; expected before the first v_mov / v_add in the output.
; NOTE(review): the negative check below is anchored with {{^}}; if llc emits
; instructions with leading whitespace it may never match - confirm.
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; Same function body, but the mask comes from init.exec.from.input (%count is
; the third inreg argument, expected in s2, bit offset 8). The four exec-setup
; instructions must be emitted back to back, ahead of the first v_mov / v_add.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; The intrinsic call sits in the non-entry block %endif. The exec setup is
; expected after the %endif marker in the output; the destination SGPR is
; captured as [[S]] rather than fixed, and %count (s2) is still read by the
; final add.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^}}v_
; GCN: %endif
; GCN: s_bfe_u32 [[S:s[0-9]+]], s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, [[S]], 0
; GCN-NEXT: s_cmp_eq_u32 [[S]], 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; Ideally these allocas would be in %endif, but that causes problems on
  ; Windows GlobalISel.
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  ; Empty side-effecting asm gives this block a body.
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
  store i32 %a, ptr addrspace(5) %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
  store i32 %b, ptr addrspace(5) %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
  %v3 = load i32, ptr addrspace(5) %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
  %v4 = load i32, ptr addrspace(5) %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

; Both intrinsics are declared with the convergent attribute.
declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }