; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -S %s | FileCheck %s

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios"

; It's profitable to convert the zext to a shuffle, which in turn will be
; lowered to 4 tbl instructions. The masks are materialized outside the loop.
define void @zext_v16i8_to_v16i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: @zext_v16i8_to_v16i32_in_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[LOAD]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <64 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 5, i32 16, i32 16, i32 16, i32 6, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 9, i32 16, i32 16, i32 16, i32 10, i32 16, i32 16, i32 16, i32 11, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 15, i32 16, i32 16, i32 16>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <64 x i8> [[TMP0]] to <16 x i32>
; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT:    store <16 x i32> [[TMP1]], ptr [[DST_GEP]], align 64
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  %ext = zext <16 x i8> %load to <16 x i32>
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <16 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Not profitable to use shuffle/tbl, as 4 tbls + materializing the masks
; require more instructions than lowering zext directly.
define void @zext_v16i8_to_v16i32_no_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: @zext_v16i8_to_v16i32_no_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 16
; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i32>
; CHECK-NEXT:    store <16 x i32> [[EXT]], ptr [[DST:%.*]], align 64
; CHECK-NEXT:    ret void
;
entry:
  %load = load <16 x i8>, ptr %src
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %dst
  ret void
}

; Widening <16 x i8> to <16 x i16> inside a loop: the zext is expected to be
; left unchanged (no shufflevector/bitcast is emitted).
define void @zext_v16i8_to_v16i16_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: @zext_v16i8_to_v16i16_in_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i16>
; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i16, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT:    store <16 x i16> [[EXT]], ptr [[DST_GEP]], align 32
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  %ext = zext <16 x i8> %load to <16 x i16>
  %dst.gep = getelementptr i16, ptr %dst, i64 %iv
  store <16 x i16> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Widening <8 x i8> to <8 x i32> inside a loop: the zext is expected to be
; rewritten as a shufflevector producing <32 x i8> followed by a bitcast.
; NOTE(review): the induction variable advances by 16 although each iteration
; loads only 8 elements; this looks irrelevant to the transform under test,
; but confirm it is intentional.
define void @zext_v8i8_to_v8i32_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: @zext_v8i8_to_v8i32_in_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
; CHECK-NEXT:    [[LOAD:%.*]] = load <8 x i8>, ptr [[SRC_GEP]], align 8
; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[LOAD]], <8 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x i8> [[TMP0]] to <8 x i32>
; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT:    store <8 x i32> [[TMP1]], ptr [[DST_GEP]], align 32
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <8 x i8>, ptr %src.gep
  %ext = zext <8 x i8> %load to <8 x i32>
  %dst.gep = getelementptr i32, ptr %dst, i64 %iv
  store <8 x i32> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}

; Widening <16 x i8> to <16 x i64> inside a loop: the zext is expected to be
; left unchanged (no shufflevector/bitcast is emitted).
define void @zext_v16i8_to_v16i64_in_loop(ptr %src, ptr %dst) {
; CHECK-LABEL: @zext_v16i8_to_v16i64_in_loop(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[IV]]
; CHECK-NEXT:    [[LOAD:%.*]] = load <16 x i8>, ptr [[SRC_GEP]], align 16
; CHECK-NEXT:    [[EXT:%.*]] = zext <16 x i8> [[LOAD]] to <16 x i64>
; CHECK-NEXT:    [[DST_GEP:%.*]] = getelementptr i64, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT:    store <16 x i64> [[EXT]], ptr [[DST_GEP]], align 128
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], 16
; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 128
; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %src.gep = getelementptr i8, ptr %src, i64 %iv
  %load = load <16 x i8>, ptr %src.gep
  %ext = zext <16 x i8> %load to <16 x i64>
  %dst.gep = getelementptr i64, ptr %dst, i64 %iv
  store <16 x i64> %ext, ptr %dst.gep
  %iv.next = add nuw i64 %iv, 16
  %ec = icmp eq i64 %iv.next, 128
  br i1 %ec, label %exit, label %loop

exit:
  ret void
}