; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

declare i16 @llvm.ctlz.i16(i16, i1) readnone
declare i32 @llvm.ctlz.i32(i32, i1) readnone
declare i64 @llvm.ctlz.i64(i64, i1) readnone

; The second operand of llvm.ctlz says whether a zero input yields poison
; (true) or a defined result (false). PTX's clz is well defined for a zero
; input (the PTX ISA gives a result between 0 and 32 inclusive for .b32), so
; both forms are expected to lower to the same single clz.b32 instruction.

; 32-bit ctlz, zero input defined.
; CHECK-LABEL: myctlz(
define i32 @myctlz(i32 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
  ret i32 %lz
}

; 32-bit ctlz, zero input undefined; same expected PTX as above.
; CHECK-LABEL: myctlz_2(
define i32 @myctlz_2(i32 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
  ret i32 %lz
}

; The LLVM intrinsic on i64 produces an i64, whereas PTX's clz.b64 writes a
; 32-bit destination, so a widening conversion is expected after the clz.
; CHECK-LABEL: myctlz64(
define i64 @myctlz64(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  ret i64 %lz
}

; Same as myctlz64, but with the zero-is-undefined flag set.
; CHECK-LABEL: myctlz64_2(
define i64 @myctlz64_2(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: cvt.u64.u32
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %lz = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  ret i64 %lz
}

; In the functions that follow, the i64 result of LLVM's ctlz intrinsic is
; truncated to 32 bits, which is already the natural result width of PTX's
; clz.b64 instruction. No conversion instructions should be needed in the PTX.
; 64-bit ctlz whose result is truncated to i32 (zero input defined). The
; expected PTX has no conversion between clz.b64 and the store of the result.
; CHECK-LABEL: myctlz64_as_32(
define i32 @myctlz64_as_32(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}

; Zero-is-undef counterpart of myctlz64_as_32. Like the other _2 functions in
; this file it passes i1 true, so that both forms of the intrinsic are covered;
; the expected PTX is the same as for the i1 false form.
; CHECK-LABEL: myctlz64_as_32_2(
define i32 @myctlz64_as_32_2(i64 %a) {
; CHECK: ld.param.
; CHECK-NEXT: clz.b64
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
  %trunc = trunc i64 %val to i32
  ret i32 %trunc
}

; ctlz.i16 is implemented by extending the input to i32, computing the result,
; and then truncating the result back down to i16. But the NVPTX ABI
; zero-extends i16 return values to i32, so the final truncation doesn't appear
; in this function.
; CHECK-LABEL: myctlz_ret16(
define i16 @myctlz_ret16(i16 %a) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  ret i16 %val
}

; Zero-is-undef counterpart of myctlz_ret16; same expected PTX.
; CHECK-LABEL: myctlz_ret16_2(
define i16 @myctlz_ret16_2(i16 %a) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-NEXT: sub.
; CHECK-NEXT: st.param.
; CHECK-NEXT: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  ret i16 %val
}

; Here we store the result of ctlz.i16 into an i16 pointer, so the trunc should
; remain. The truncation and the subtraction may be emitted in either order,
; hence the DAG-style checks for those two instructions.
; CHECK-LABEL: myctlz_store16(
define void @myctlz_store16(i16 %a, ptr %b) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
  store i16 %val, ptr %b
  ret void
}

; Zero-is-undef counterpart of myctlz_store16. It passes i1 true, matching the
; other _2 functions in this file, so this is not just a repeat of the function
; above; the expected PTX is the same.
; CHECK-LABEL: myctlz_store16_2(
define void @myctlz_store16_2(i16 %a, ptr %b) {
; CHECK: ld.param.
; CHECK-NEXT: cvt.u32.u16
; CHECK-NEXT: clz.b32
; CHECK-DAG: cvt.u16.u32
; CHECK-DAG: sub.
; CHECK: st.{{[a-z]}}16
; CHECK: ret;
  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
  store i16 %val, ptr %b
  ret void
}