; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

; Check that invariant loads from the global addrspace are lowered to
; ld.global.nc.
;
; Every positive test below loads through a ptr addrspace(1) argument with
; !invariant.load attached. The two tests at the end each drop exactly one of
; those two ingredients and expect an ordinary (non-.nc) load instead.

; Scalar i32: the simplest invariant global load.
define i32 @ld_global(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_param_0];
; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
  ret i32 %a
}

define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
; Load of v2f16 is weird. We consider it to be a legal type, which happens to be
; loaded/stored as a 32-bit scalar. The two halves are then unpacked with
; mov.b32 and summed in f32.
; CHECK-LABEL: ld_global_v2f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<4>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .f32 %f<4>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2f16_param_0];
; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
; CHECK-NEXT:    ret;
  %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x half> %a, i32 0
  %v2 = extractelement <2 x half> %a, i32 1
  %sum = fadd half %v1, %v2
  ret half %sum
}

define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
; Larger f16 vectors may be split into individual f16 elements and multiple
; loads/stores may be vectorized using f16 element type. Practically it's
; limited to v4 variant only.
; CHECK-LABEL: ld_global_v4f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .f32 %f<10>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4f16_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
; CHECK-NEXT:    cvt.f32.f16 %f4, %rs4;
; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT:    ret;
  %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x half> %a, i32 0
  %v2 = extractelement <4 x half> %a, i32 1
  %v3 = extractelement <4 x half> %a, i32 2
  %v4 = extractelement <4 x half> %a, i32 3
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}

define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of loads/stores,
; so load/store vectorizer has to convert v8f16 -> v4 x v2f16.
; Only the even-indexed elements (0, 2, 4, 6) are used, i.e. the low half of
; each 32-bit word; the high half goes to a scratch "tmp" register.
; CHECK-LABEL: ld_global_v8f16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .f32 %f<10>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8f16_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
; CHECK-NEXT:    cvt.f32.f16 %f1, %rs4;
; CHECK-NEXT:    cvt.f32.f16 %f2, %rs3;
; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
; CHECK-NEXT:    cvt.f32.f16 %f4, %rs2;
; CHECK-NEXT:    cvt.f32.f16 %f5, %rs1;
; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
; CHECK-NEXT:    ret;
  %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x half> %a, i32 0
  %v2 = extractelement <8 x half> %a, i32 2
  %v3 = extractelement <8 x half> %a, i32 4
  %v4 = extractelement <8 x half> %a, i32 6
  %sum1 = fadd half %v1, %v2
  %sum2 = fadd half %v3, %v4
  %sum = fadd half %sum1, %sum2
  ret half %sum
}

; <8 x i8> is expected to be loaded as two 32-bit words; bytes 0, 2, 4 and 6
; are then picked out with bfe.u32 and summed in 16-bit registers.
define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v8i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8i8_param_0];
; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    bfe.u32 %r3, %r2, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
; CHECK-NEXT:    bfe.u32 %r5, %r1, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    bfe.u32 %r6, %r1, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
; CHECK-NEXT:    add.s16 %rs5, %rs4, %rs3;
; CHECK-NEXT:    add.s16 %rs6, %rs2, %rs1;
; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r7, %rs7;
; CHECK-NEXT:    and.b32 %r8, %r7, 255;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
; CHECK-NEXT:    ret;
  %a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <8 x i8> %a, i32 0
  %v2 = extractelement <8 x i8> %a, i32 2
  %v3 = extractelement <8 x i8> %a, i32 4
  %v4 = extractelement <8 x i8> %a, i32 6
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum = add i8 %sum1, %sum2
  ret i8 %sum
}

; <16 x i8> is expected to be loaded as four 32-bit words; the even-indexed
; bytes are extracted with bfe.u32 and reduced with a tree of 16-bit adds.
define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v16i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<16>;
; CHECK-NEXT:    .reg .b32 %r<15>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v16i8_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    bfe.u32 %r6, %r4, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs2, %r6;
; CHECK-NEXT:    bfe.u32 %r7, %r3, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    bfe.u32 %r8, %r3, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs4, %r8;
; CHECK-NEXT:    bfe.u32 %r9, %r2, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r9;
; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs6, %r10;
; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r11;
; CHECK-NEXT:    bfe.u32 %r12, %r1, 0, 8;
; CHECK-NEXT:    cvt.u16.u32 %rs8, %r12;
; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
; CHECK-NEXT:    add.s16 %rs10, %rs6, %rs5;
; CHECK-NEXT:    add.s16 %rs11, %rs4, %rs3;
; CHECK-NEXT:    add.s16 %rs12, %rs2, %rs1;
; CHECK-NEXT:    add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT:    add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT:    add.s16 %rs15, %rs13, %rs14;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs15;
; CHECK-NEXT:    and.b32 %r14, %r13, 255;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
; CHECK-NEXT:    ret;
  %a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <16 x i8> %a, i32 0
  %v2 = extractelement <16 x i8> %a, i32 2
  %v3 = extractelement <16 x i8> %a, i32 4
  %v4 = extractelement <16 x i8> %a, i32 6
  %v5 = extractelement <16 x i8> %a, i32 8
  %v6 = extractelement <16 x i8> %a, i32 10
  %v7 = extractelement <16 x i8> %a, i32 12
  %v8 = extractelement <16 x i8> %a, i32 14
  %sum1 = add i8 %v1, %v2
  %sum2 = add i8 %v3, %v4
  %sum3 = add i8 %v5, %v6
  %sum4 = add i8 %v7, %v8
  %sum5 = add i8 %sum1, %sum2
  %sum6 = add i8 %sum3, %sum4
  %sum7 = add i8 %sum5, %sum6
  ret i8 %sum7
}

; <2 x i32> maps directly onto a v2.u32 vector load.
define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v2i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2i32_param_0];
; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %a = load <2 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <2 x i32> %a, i32 0
  %v2 = extractelement <2 x i32> %a, i32 1
  %sum = add i32 %v1, %v2
  ret i32 %sum
}

; <4 x i32> maps directly onto a v4.u32 vector load.
define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_global_v4i32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4i32_param_0];
; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r1, %r2;
; CHECK-NEXT:    add.s32 %r6, %r3, %r4;
; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %a = load <4 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
  %v1 = extractelement <4 x i32> %a, i32 0
  %v2 = extractelement <4 x i32> %a, i32 1
  %v3 = extractelement <4 x i32> %a, i32 2
  %v4 = extractelement <4 x i32> %a, i32 3
  %sum1 = add i32 %v1, %v2
  %sum2 = add i32 %v3, %v4
  %sum3 = add i32 %sum1, %sum2
  ret i32 %sum3
}

; Negative test: a global-addrspace load without !invariant.load must stay a
; plain ld.global (no .nc qualifier).
define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
; CHECK-LABEL: ld_not_invariant(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_invariant_param_0];
; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(1) %ptr
  ret i32 %a
}

; Negative test: the load IS marked invariant but goes through an
; addrspace(0) pointer, so the address space is the only reason ld.global.nc
; is not selected and a plain ld is expected.
; NOTE(review): the expected PTX is kept as-is on the understanding that the
; NVPTX backend only selects ld.global.nc for global-addrspace loads; re-run
; utils/update_llc_test_checks.py to confirm the assertions are unchanged.
define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
; CHECK-LABEL: ld_not_global_addrspace(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_global_addrspace_param_0];
; CHECK-NEXT:    ld.u32 %r1, [%rd1];
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  %a = load i32, ptr addrspace(0) %ptr, !invariant.load !0
  ret i32 %a
}

; Empty metadata node used as the operand of every !invariant.load above.
!0 = !{}