; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s

; This test verifies that the vectorizer can handle an extended sequence of
; getelementptr instructions and generate longer vectors. With special handling,
; some elements can still be vectorized even if they require looking up the
; common underlying object deeper than 6 levels from the original pointer.

; The tests below are a simplified version of an actual performance-oriented
; workload; the offsets in the getelementptr instructions are similar or
; identical to keep the tests simple.

; Four stores (half, <4 x half>, <2 x half>, half) placed at byte offsets
; 0, 2, 10 and 14 from %a6, i.e. 16 contiguous bytes. %a6 is the 6th
; getelementptr in the chain starting at %arg1, %b7 the 7th, and %c8 / %d8 the
; 8th. The assertions expect a single <8 x half> store to %a6 with align 16.
define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(
; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
; CHECK-NEXT:    ret void
;

  ; Five chained byte-offset GEPs (constant and %arg0 offsets interleaved)
  ; sit between %arg1 and the store addresses.
  %level1 = getelementptr i8, ptr %arg1, i32 917504
  %level2 = getelementptr i8, ptr %level1, i32 %arg0
  %level3 = getelementptr i8, ptr %level2, i32 32768
  %level4 = getelementptr i8, ptr %level3, i32 %arg0
  %level5 = getelementptr i8, ptr %level4, i32 %arg0

  ; Store addresses. %c8 and %d8 are both derived from %b7, so relative to
  ; %a6 the offsets are: %b7 = +2, %c8 = +10, %d8 = +14.
  %a6 = getelementptr i8, ptr %level5, i32 %arg0
  %b7 = getelementptr i8, ptr %a6, i32 2
  %c8 = getelementptr i8, ptr %b7, i32 8
  %d8 = getelementptr i8, ptr %b7, i32 12

  ; Only the first store is marked align 16; the rest are align 2.
  store half 0xH0000, ptr %a6, align 16
  store <4 x half> zeroinitializer, ptr %b7, align 2
  store <2 x half> zeroinitializer, ptr %c8, align 2
  store half 0xH0000, ptr %d8, align 2
  ret void
}

; Eight scalar half stores whose addresses form a linear chain: each pointer is
; the previous one plus 2 bytes, so %a6 .. %h13 are the 6th .. 13th
; getelementptr counted from %arg1. The assertions expect a single
; <8 x half> store to %a6 with align 16.
define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13(
; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
; CHECK-NEXT:    [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
; CHECK-NEXT:    [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
; CHECK-NEXT:    [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
; CHECK-NEXT:    [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
; CHECK-NEXT:    [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
; CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[A6]], align 16
; CHECK-NEXT:    ret void
;

  ; Same five-GEP prefix as in the previous function.
  %level1 = getelementptr i8, ptr %arg1, i32 917504
  %level2 = getelementptr i8, ptr %level1, i32 %arg0
  %level3 = getelementptr i8, ptr %level2, i32 32768
  %level4 = getelementptr i8, ptr %level3, i32 %arg0
  %level5 = getelementptr i8, ptr %level4, i32 %arg0

  ; Each address is built from the previous one (+2 bytes), so the chain depth
  ; grows by one per element.
  %a6 = getelementptr i8, ptr %level5, i32 %arg0
  %b7 = getelementptr i8, ptr %a6, i32 2
  %c8 = getelementptr i8, ptr %b7, i32 2
  %d9 = getelementptr i8, ptr %c8, i32 2
  %e10 = getelementptr i8, ptr %d9, i32 2
  %f11 = getelementptr i8, ptr %e10, i32 2
  %g12 = getelementptr i8, ptr %f11, i32 2
  %h13 = getelementptr i8, ptr %g12, i32 2

  ; %e10 is %a6 + 8 bytes and is the only interior store marked align 8;
  ; the others (besides the first, align 16) are align 2.
  store half 0xH0000, ptr %a6, align 16
  store half 0xH0000, ptr %b7, align 2
  store half 0xH0000, ptr %c8, align 2
  store half 0xH0000, ptr %d9, align 2
  store half 0xH0000, ptr %e10, align 8
  store half 0xH0000, ptr %f11, align 2
  store half 0xH0000, ptr %g12, align 2
  store half 0xH0000, ptr %h13, align 2
  ret void
}

; Variant in addrspace(3) where the address computation is spread over three
; basic blocks and the stored values are function arguments rather than
; constants only. %dst is the 6th getelementptr counted from %arg1_ptr and the
; %dst_off* pointers are the 7th. Stores: half at +0, <4 x half> at +2,
; <2 x half> at +10 and half at +14, all align 2. The assertions expect the
; values to be assembled with insertelement/extractelement into one <8 x half>
; and written by a single align-2 store.
; NOTE(review): the function name lists more store widths (v1_4_4_4_2_1 to
; v8_8) than the four stores present in this body - presumably kept from a
; longer variant of the test; confirm.
define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) {
; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(
; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752
; CHECK-NEXT:    br [[DOTPREHEADER11_PREHEADER:label %.*]]
; CHECK:       [[_PREHEADER11_PREHEADER:.*:]]
; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]]
; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2
; CHECK-NEXT:    br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]]
; CHECK:       [[_LR_PH:.*:]]
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]]
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4
; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7
; CHECK-NEXT:    store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2
; CHECK-NEXT:    br [[DOTEXIT_POINT]]
; CHECK:       [[_EXIT_POINT:.*:]]
; CHECK-NEXT:    ret void
;
  ; First GEP of the chain lives in the entry block.
  %base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752
  br label %.preheader11.preheader

.preheader11.preheader:
  ; %base2 is an integer offset (%arg0 << 6), not a pointer; it feeds %base3.
  %base2 = shl nuw nsw i32 %arg0, 6
  %base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2

  %base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2
  %base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3

  ; The stores are executed only when %arg0 > 2 (signed).
  %cmp = icmp sgt i32 %arg0, 2
  br i1 %cmp, label %.lr.ph, label %.exit_point

.lr.ph:
  %gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4

  ; All three offset pointers are derived directly from %dst.
  %dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5
  %dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2
  %dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10
  %dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14

  ; 2 + 8 + 4 + 2 bytes at offsets 0, 2, 10, 14: 16 contiguous bytes.
  store half %arg6_half, ptr addrspace(3) %dst, align 2
  store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2
  store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2
  store half %arg7_half, ptr addrspace(3) %dst_off14, align 2
  br label %.exit_point

.exit_point:
  ret void
}

; A regression test for merging equivalence classes. It is reduced and adapted
; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed in
; post-commit checks with memory sanitizer on the initial attempt to implement
; the merging of the equivalence classes.
; Three @llvm.ptrmask calls (each with mask 0) alternate with pairs of
; getelementptr steps (+8, then +7). Two double loads then read from the last
; masked pointer and from that pointer + 8. The assertions expect the two loads
; to be replaced by one <2 x double> load whose lanes are extracted and added.
define void @variadics1(ptr %vlist) {
; CHECK-LABEL: define void @variadics1(
; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:    [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0)
; CHECK-NEXT:    [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8
; CHECK-NEXT:    [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7
; CHECK-NEXT:    [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0)
; CHECK-NEXT:    [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
; CHECK-NEXT:    [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
; CHECK-NEXT:    [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
; CHECK-NEXT:    [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT:    [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT:    [[X5:%.*]] = fadd double [[X42]], [[X31]]
; CHECK-NEXT:    store double [[X5]], ptr null, align 8
; CHECK-NEXT:    ret void
;
  ; Address chain: ptrmask -> +8 -> +7 -> ptrmask -> +8 -> +7 -> ptrmask.
  ; Note the mixed index types (i64 for the +8 steps, i32 for the +7 steps).
  %argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0)
  %argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8
  %x0 = getelementptr i8, ptr %argp.next8, i32 7
  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0)
  %argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8
  %x2 = getelementptr i8, ptr %argp.next12, i32 7
  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0)
  ; Two adjacent 8-byte loads: at the masked pointer and at +8 from it.
  ; NOTE(review): the expected vector load carries align 4294967296 although
  ; both scalar loads are align 8 - presumably inferred from the zero mask;
  ; confirm.
  %x3 = load double, ptr %argp.cur16.aligned, align 8
  %argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8
  %x4 = load double, ptr %argp.cur16.aligned_off8, align 8
  ; The sum is stored through a null pointer; presumably this only keeps both
  ; loaded values live in the reduced test.
  %x5 = fadd double %x4, %x3
  store double %x5, ptr null, align 8
  ret void
}

declare ptr @llvm.ptrmask.p0.i64(ptr, i64)