; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK %s

; A kernel containing an inline-asm s_endpgm: two s_endpgm instructions are
; expected in the output.
; CHECK-LABEL: {{^}}inline_asm:
; CHECK: s_endpgm
; CHECK: s_endpgm
define amdgpu_kernel void @inline_asm(ptr addrspace(1) %out) {
entry:
  store i32 5, ptr addrspace(1) %out
  call void asm sideeffect "s_endpgm", ""()
  ret void
}

; Same expectation as above, using the amdgpu_ps calling convention.
; CHECK-LABEL: {{^}}inline_asm_shader:
; CHECK: s_endpgm
; CHECK: s_endpgm
define amdgpu_ps void @inline_asm_shader() {
entry:
  call void asm sideeffect "s_endpgm", ""()
  ret void
}

; Make sure VGPR inline assembly is treated as divergent.
; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
; CHECK: v_mov_b32 v{{[0-9]+}}, 0
; CHECK: v_cmp_eq_u32
; CHECK: s_and_saveexec_b64
define amdgpu_kernel void @branch_on_asm_vgpr(ptr addrspace(1) %out) {
  %zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
  %cmp = icmp eq i32 %zero, 0
  br i1 %cmp, label %if, label %endif

if:
  store i32 0, ptr addrspace(1) %out
  br label %endif

endif:
  ret void
}

; Make sure SGPR inline assembly is treated as uniform
; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
; CHECK: s_mov_b32 s{{[0-9]+}}, 0
; CHECK: s_cmp_lg_u32
; CHECK: s_cbranch_scc0
define amdgpu_kernel void @branch_on_asm_sgpr(ptr addrspace(1) %out) {
  %zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
  %cmp = icmp eq i32 %zero, 0
  br i1 %cmp, label %if, label %endif

if:
  store i32 0, ptr addrspace(1) %out
  br label %endif

endif:
  ret void
}

; A 64-bit "=s" result produced by a VALU compare written as inline asm is
; expected to be copied to VGPRs and stored as a dwordx2.
; CHECK-LABEL: {{^}}v_cmp_asm:
; CHECK: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; CHECK: v_cmp_ne_u32_e64 s[[[MASK_LO:[0-9]+]]:[[MASK_HI:[0-9]+]]], 0, [[SRC]]
; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
; CHECK: buffer_store_dwordx2 v[[[V_LO]]:[[V_HI]]]
define amdgpu_kernel void @v_cmp_asm(ptr addrspace(1) %out, i32 %in) {
  %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
  store i64 %sgpr, ptr addrspace(1) %out
  ret void
}

; Code-size tests. Per the expected values in this group, an asm string with
; no instructions yields a reported size of 4, one instruction yields 12 and
; two instructions yield 20.

; CHECK-LABEL: {{^}}code_size_inline_asm:
; CHECK: codeLenInByte = 12
define amdgpu_kernel void @code_size_inline_asm(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "v_nop_e64", ""()
  ret void
}

; All inlineasm instructions are assumed to be the maximum size
; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
; CHECK: codeLenInByte = 12
define amdgpu_kernel void @code_size_inline_asm_small_inst(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "v_nop_e32", ""()
  ret void
}

; Two instructions separated by newlines inside one asm string.
; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
; CHECK: codeLenInByte = 20
define amdgpu_kernel void @code_size_inline_asm_2_inst(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "
    v_nop_e64
    v_nop_e64
   ", ""()
  ret void
}

; A blank line between the two instructions must not change the size.
; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
; CHECK: codeLenInByte = 20
define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "
    v_nop_e64

    v_nop_e64
   ", ""()
  ret void
}

; Empty asm string.
; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
; CHECK: codeLenInByte = 4
define amdgpu_kernel void @code_size_inline_asm_0_inst(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "", ""()
  ret void
}

; An asm string holding only a comment is expected to count as zero
; instructions; the following variants move the newline and add a second
; comment on the same line.
; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
; CHECK: codeLenInByte = 4
define amdgpu_kernel void @code_size_inline_asm_1_comment(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "; comment", ""()
  ret void
}

; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
; CHECK: codeLenInByte = 4
define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "
; comment", ""()
  ret void
}

; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
; CHECK: codeLenInByte = 4
define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "; comment
", ""()
  ret void
}

; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
; CHECK: codeLenInByte = 4
define amdgpu_kernel void @code_size_inline_asm_2_comments_line(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "; first comment ; second comment", ""()
  ret void
}

; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
; CHECK: codeLenInByte = 4
define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "; first comment;second comment", ""()
  ret void
}

; Two instructions interleaved with leading, inline, separate and trailing
; comments: only the two instructions are expected to be counted.
; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
; CHECK: codeLenInByte = 20
define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "; comment
    v_nop_e64 ; inline comment
; separate comment
    v_nop_e64

    ; trailing comment
    ; extra comment
  ", ""()
  ret void
}

; As above, but the asm string starts directly with an instruction.
; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
; CHECK: codeLenInByte = 20
define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "v_nop_e64 ; inline comment
; separate comment
    v_nop_e64

    ; trailing comment
    ; extra comment
  ", ""()
  ret void
}

; As mixed_comments0, but with instructions that take operands.
; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
; CHECK: codeLenInByte = 20
define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(ptr addrspace(1) %out) {
entry:
  call void asm sideeffect "; comment
    v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
; separate comment
    v_bfrev_b32_e32 v0, 1

    ; trailing comment
    ; extra comment
  ", ""()
  ret void
}

; An i64 immediate bound to the physical register pair v[0:1].
; FIXME: Should not have intermediate sgprs
; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
; CHECK: v_mov_b32_e32 v0, 0x1e240
; CHECK: v_mov_b32_e32 v1, 0
; CHECK: use v[0:1]
define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
entry:
  call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)
  ret void
}

; An i1 true immediate bound to v0 is expected to be materialized as 1.
; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
; CHECK: v_mov_b32_e32 v0, 1{{$}}
; CHECK: ; use v0
define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
entry:
  call void asm sideeffect "; use $0 ", "{v0}"(i1 true)
  ret void
}

; FIXME: This behavior is nonsense. We should probably disallow i1 asm

; A loaded i1 passed in v0 and an i1 result returned in v1; the result is
; expected to be masked with 1 before the byte store.
; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
; CHECK: {{buffer|flat}}_load_ubyte [[LOAD:v[0-9]+]]
; CHECK-NOT: [[LOAD]]
; CHECK: ; use v0
; CHECK: v_and_b32_e32 [[STORE:v[0-9]+]], 1, v1
; CHECK: {{buffer|flat}}_store_byte [[STORE]],
define amdgpu_kernel void @i1_input_phys_vgpr() {
entry:
  %val = load i1, ptr addrspace(1) undef
  %cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
  store i1 %cc, ptr addrspace(1) undef
  ret void
}

; FIXME: Should probably be masking high bits of load.
; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
; CHECK: buffer_load_ubyte v0
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: buffer_load_ubyte v1
; CHECK-NEXT: s_waitcnt
; CHECK-NEXT: ASMSTART
define amdgpu_kernel void @i1_input_phys_vgpr_x2() {
entry:
  ; Two volatile i1 loads feed the physical registers v0 and v1 directly.
  %val0 = load volatile i1, ptr addrspace(1) undef
  %val1 = load volatile i1, ptr addrspace(1) undef
  call void asm sideeffect "; use $0 $1 ", "{v0}, {v1}"(i1 %val0, i1 %val1)
  ret void
}

; Two asm statements both define v0: the first result is expected to be
; copied to v1 before the second definition overwrites v0.
; CHECK-LABEL: {{^}}muliple_def_phys_vgpr:
; CHECK: ; def v0
; CHECK: v_mov_b32_e32 v1, v0
; CHECK: ; def v0
; CHECK: v_lshlrev_b32_e32 v{{[0-9]+}}, v0, v1
define amdgpu_kernel void @muliple_def_phys_vgpr() {
entry:
  %def0 = call i32 asm sideeffect "; def $0 ", "={v0}"()
  %def1 = call i32 asm sideeffect "; def $0 ", "={v0}"()
  %shifted = shl i32 %def0, %def1
  store i32 %shifted, ptr addrspace(1) undef
  ret void
}

; The 'c' operand modifier on an "n" immediate is expected to print the bare
; value.
; CHECK-LABEL: {{^}}asm_constraint_c_n:
; CHECK: s_trap 10{{$}}
define amdgpu_kernel void @asm_constraint_c_n() {
entry:
  tail call void asm sideeffect "s_trap ${0:c}", "n"(i32 10) #1
  ret void
}

; The 'n' operand modifier on an "n" immediate is expected to print the
; negated value.
; CHECK-LABEL: {{^}}asm_constraint_n_n:
; CHECK: s_trap -10{{$}}
define amdgpu_kernel void @asm_constraint_n_n() {
entry:
  tail call void asm sideeffect "s_trap ${0:n}", "n"(i32 10) #1
  ret void
}

; Make sure tuples of 3 SGPRs are printed with the [] syntax instead
; of the tablegen default.
; CHECK-LABEL: {{^}}sgpr96_name_format:
; CHECK: ; sgpr96 s[0:2]
define amdgpu_kernel void @sgpr96_name_format() {
entry:
  tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
  ret void
}

; Check aggregate types are handled properly.
; The label below is anchored to the start of the line and ends with a colon,
; like every other label in this file.
; CHECK-LABEL: {{^}}mad_u64:
; CHECK: v_mad_u64_u32
define void @mad_u64(i32 %x, i1 %c0) {
entry:
  br i1 %c0, label %exit, label %false

false:
  ; The asm returns a two-element struct: one "=v" and one "=s" output.
  %s0 = tail call { i64, i64 } asm sideeffect "v_mad_u64_u32 $0, $1, $2, $3, $4", "=v,=s,v,v,v"(i32 -766435501, i32 %x, i64 0)
  br label %exit

exit:
  ; The struct flows through a phi before its members are extracted.
  %s1 = phi { i64, i64} [ undef, %entry ], [ %s0, %false]
  %v0 = extractvalue { i64, i64 } %s1, 0
  %v1 = extractvalue { i64, i64 } %s1, 1
  tail call void asm sideeffect "; use $0", "v"(i64 %v0)
  tail call void asm sideeffect "; use $0", "v"(i64 %v1)
  ret void
}

; scc used as a physical-register constraint with an i32 value.
; CHECK-LABEL: {{^}}scc_as_i32:
; CHECK: ; def scc
; CHECK: ; use scc
define void @scc_as_i32() {
  %scc = call i32 asm sideeffect "; def $0", "={scc}"()
  call void asm sideeffect "; use $0 ", "{scc}"(i32 %scc)
  ret void
}

; scc used as a physical-register constraint with an i1 value.
; CHECK-LABEL: {{^}}scc_as_i1:
; CHECK: ; def scc
; CHECK: ; use scc
define void @scc_as_i1() {
  %scc = call i1 asm sideeffect "; def $0", "={scc}"()
  call void asm sideeffect "; use $0 ", "{scc}"(i1 %scc)
  ret void
}

; Make sure the SGPR def is treated as a uniform value when the inline
; assembly also defines a divergent value. The add should be scalar
; and not introduce illegal vgpr to sgpr copies.
; CHECK-LABEL: {{^}}mixed_def_vgpr_sgpr_def_asm:
; CHECK: ; def v0 s[4:5]
; CHECK: s_add_u32
; CHECK-NEXT: s_addc_u32
; CHECK: ; use s[4:5]
define void @mixed_def_vgpr_sgpr_def_asm() {
  ; One asm statement defines a VGPR (output 0) and s[4:5] (output 1).
  %vgpr_sgpr = call { i32, i64 } asm sideeffect "; def $0 $1 ", "=v,={s[4:5]}"()
  %vgpr = extractvalue { i32, i64 } %vgpr_sgpr, 0
  %sgpr = extractvalue { i32, i64 } %vgpr_sgpr, 1
  ; Only the SGPR result is used; the 64-bit add is expected to be emitted as
  ; an s_add_u32 / s_addc_u32 pair.
  %sgpr.add = add i64 %sgpr, 2
  call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
  ret void
}

; Same as above with the output order swapped: s[4:5] is output 0 and the
; VGPR is output 1.
; CHECK-LABEL: {{^}}mixed_def_sgpr_vgpr_def_asm:
; CHECK: ; def s[4:5] v0
; CHECK: s_add_u32
; CHECK-NEXT: s_addc_u32
; CHECK: ; use s[4:5]
define void @mixed_def_sgpr_vgpr_def_asm() {
  %sgpr_vgpr = call { i64, i32 } asm sideeffect "; def $0 $1 ", "={s[4:5]},=v"()
  %sgpr = extractvalue { i64, i32 } %sgpr_vgpr, 0
  %vgpr = extractvalue { i64, i32 } %sgpr_vgpr, 1
  %sgpr.add = add i64 %sgpr, 2
  call void asm sideeffect "; use $0 ", "{s[4:5]}"(i64 %sgpr.add)
  ret void
}