; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=DEFAULT %s
; RUN: opt -p loop-vectorize -mcpu=neoverse-v1 -S %s | FileCheck --check-prefixes=VSCALEFORTUNING2 %s
; RUN: opt -p loop-vectorize -mcpu=neoverse-v1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S %s | FileCheck --check-prefixes=PRED %s

; The same input is run through the loop vectorizer three times: without an
; -mcpu option, with -mcpu=neoverse-v1, and with -mcpu=neoverse-v1 plus
; -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue. Each
; invocation is matched against its own check prefix. The assertions are
; generated; regenerate them with utils/update_test_checks.py rather than
; editing them by hand.

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-macosx14.0.0"

; Loop with two chained recurrence phis and an or-reduction:
;  * %2 holds the value loaded (%5) in the previous iteration and %3 holds the
;    previous value of %2; both start at 0.
;  * %sum.red accumulates, with `or`, two values loaded from %src.2 per
;    iteration.
;  * The exit compare tests %iv (not %iv.next) against %y; the vectorized
;    outputs below use `add i64 %y, 1` as the iteration count.
; In the expected output the run without -mcpu keeps the loop scalar, the
; neoverse-v1 run uses <vscale x 4 x i32> with two reduction phis per vector
; iteration, and the predicated run uses <vscale x 4 x i32> under an active
; lane mask.
define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) #0 {
; DEFAULT-LABEL: define i32 @chained_recurrences(
; DEFAULT-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: br label [[LOOP:%.*]]
; DEFAULT: loop:
; DEFAULT-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP68:%.*]], [[LOOP]] ]
; DEFAULT-NEXT: [[SCALAR_RECUR15:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP0]], [[LOOP]] ]
; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; DEFAULT-NEXT: [[SUM_RED:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RED_2:%.*]], [[LOOP]] ]
; DEFAULT-NEXT: [[TMP67:%.*]] = add i64 [[Y]], 1
; DEFAULT-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP67]]
; DEFAULT-NEXT: [[TMP68]] = load i32, ptr [[GEP_1]], align 4
; DEFAULT-NEXT: [[OR3:%.*]] = or i32 [[SCALAR_RECUR15]], [[X]]
; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; DEFAULT-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 1
; DEFAULT-NEXT: [[TMP69:%.*]] = shl i32 [[OR3]], 1
; DEFAULT-NEXT: [[TMP70:%.*]] = or i32 [[TMP69]], 2
; DEFAULT-NEXT: [[SHL19:%.*]] = shl i32 [[X]], 1
; DEFAULT-NEXT: [[TMP71:%.*]] = or i32 [[SHR]], [[SHL19]]
; DEFAULT-NEXT: [[TMP72:%.*]] = or i32 [[TMP71]], [[TMP70]]
; DEFAULT-NEXT: [[TMP73:%.*]] = or i32 [[TMP72]], [[X]]
; DEFAULT-NEXT: [[OR20:%.*]] = or i32 [[Z]], [[X]]
; DEFAULT-NEXT: [[NOT:%.*]] = and i32 [[OR20]], 1
; DEFAULT-NEXT: [[AND:%.*]] = xor i32 [[NOT]], 1
; DEFAULT-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
; DEFAULT-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
; DEFAULT-NEXT: [[TMP74:%.*]] = load i32, ptr [[GEP_2]], align 4
; DEFAULT-NEXT: [[SHR24:%.*]] = lshr i32 [[TMP73]], 1
; DEFAULT-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
; DEFAULT-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
; DEFAULT-NEXT: [[TMP75:%.*]] = load i32, ptr [[GEP_3]], align 4
; DEFAULT-NEXT: [[RED_1:%.*]] = or i32 [[TMP74]], [[SUM_RED]]
; DEFAULT-NEXT: [[RED_2]] = or i32 [[RED_1]], [[TMP75]]
; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[Y]]
; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
; DEFAULT: exit:
; DEFAULT-NEXT: [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP]] ]
; DEFAULT-NEXT: ret i32 [[RED_2_LCSSA]]
;
; VSCALEFORTUNING2-LABEL: define i32 @chained_recurrences(
; VSCALEFORTUNING2-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
; VSCALEFORTUNING2-NEXT: entry:
; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1
; VSCALEFORTUNING2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; VSCALEFORTUNING2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8
; VSCALEFORTUNING2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VSCALEFORTUNING2: vector.ph:
; VSCALEFORTUNING2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; VSCALEFORTUNING2-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
; VSCALEFORTUNING2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; VSCALEFORTUNING2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; VSCALEFORTUNING2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; VSCALEFORTUNING2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
; VSCALEFORTUNING2-NEXT: [[TMP7:%.*]] = add i64 [[Y]], 1
; VSCALEFORTUNING2-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP7]]
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[TMP9:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = or <vscale x 4 x i32> [[TMP9]], [[TMP10]]
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = and <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i32> [[TMP13]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP15:%.*]] = zext <vscale x 4 x i32> [[TMP14]] to <vscale x 4 x i64>
; VSCALEFORTUNING2-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i64> [[TMP15]], i32 0
; VSCALEFORTUNING2-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP16]]
; VSCALEFORTUNING2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP17]], i64 0
; VSCALEFORTUNING2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[DOTSPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
; VSCALEFORTUNING2-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 4
; VSCALEFORTUNING2-NEXT: [[TMP20:%.*]] = sub i32 [[TMP19]], 1
; VSCALEFORTUNING2-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP20]]
; VSCALEFORTUNING2-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
; VSCALEFORTUNING2-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4
; VSCALEFORTUNING2-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
; VSCALEFORTUNING2-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP23]]
; VSCALEFORTUNING2-NEXT: br label [[VECTOR_BODY:%.*]]
; VSCALEFORTUNING2: vector.body:
; VSCALEFORTUNING2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT7:%.*]], [[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[VECTOR_RECUR4:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT3]], [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[VEC_PHI5:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
; VSCALEFORTUNING2-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP8]], align 4
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP24]], i64 0
; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT7]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT6]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VSCALEFORTUNING2-NEXT: [[TMP25:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT7]], i32 -1)
; VSCALEFORTUNING2-NEXT: [[TMP26]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT7]], <vscale x 4 x i32> [[BROADCAST_SPLAT7]], i32 -1)
; VSCALEFORTUNING2-NEXT: [[TMP27:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP25]], i32 -1)
; VSCALEFORTUNING2-NEXT: [[TMP28:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[TMP25]], <vscale x 4 x i32> [[TMP26]], i32 -1)
; VSCALEFORTUNING2-NEXT: [[TMP29:%.*]] = or <vscale x 4 x i32> [[TMP27]], [[BROADCAST_SPLAT]]
; VSCALEFORTUNING2-NEXT: [[TMP30:%.*]] = or <vscale x 4 x i32> [[TMP28]], [[BROADCAST_SPLAT]]
; VSCALEFORTUNING2-NEXT: [[TMP31:%.*]] = shl <vscale x 4 x i32> [[TMP29]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP32:%.*]] = shl <vscale x 4 x i32> [[TMP30]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP33:%.*]] = or <vscale x 4 x i32> [[TMP31]], splat (i32 2)
; VSCALEFORTUNING2-NEXT: [[TMP34:%.*]] = or <vscale x 4 x i32> [[TMP32]], splat (i32 2)
; VSCALEFORTUNING2-NEXT: [[TMP35:%.*]] = or <vscale x 4 x i32> [[TMP11]], [[TMP33]]
; VSCALEFORTUNING2-NEXT: [[TMP36:%.*]] = or <vscale x 4 x i32> [[TMP11]], [[TMP34]]
; VSCALEFORTUNING2-NEXT: [[TMP37:%.*]] = or <vscale x 4 x i32> [[TMP35]], [[BROADCAST_SPLAT]]
; VSCALEFORTUNING2-NEXT: [[TMP38:%.*]] = or <vscale x 4 x i32> [[TMP36]], [[BROADCAST_SPLAT]]
; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
; VSCALEFORTUNING2-NEXT: [[TMP39:%.*]] = lshr <vscale x 4 x i32> [[TMP37]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP40:%.*]] = lshr <vscale x 4 x i32> [[TMP38]], splat (i32 1)
; VSCALEFORTUNING2-NEXT: [[TMP41:%.*]] = zext <vscale x 4 x i32> [[TMP39]] to <vscale x 4 x i64>
; VSCALEFORTUNING2-NEXT: [[TMP42:%.*]] = zext <vscale x 4 x i32> [[TMP40]] to <vscale x 4 x i64>
; VSCALEFORTUNING2-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP41]]
; VSCALEFORTUNING2-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP42]]
; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP43]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
; VSCALEFORTUNING2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP44]], i32 4, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison)
; VSCALEFORTUNING2-NEXT: [[TMP45:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
; VSCALEFORTUNING2-NEXT: [[TMP46:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER8]], [[VEC_PHI5]]
; VSCALEFORTUNING2-NEXT: [[TMP47]] = or <vscale x 4 x i32> [[TMP45]], [[WIDE_MASKED_GATHER9]]
; VSCALEFORTUNING2-NEXT: [[TMP48]] = or <vscale x 4 x i32> [[TMP46]], [[WIDE_MASKED_GATHER10]]
; VSCALEFORTUNING2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; VSCALEFORTUNING2-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VSCALEFORTUNING2-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VSCALEFORTUNING2: middle.block:
; VSCALEFORTUNING2-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i32> [[TMP48]], [[TMP47]]
; VSCALEFORTUNING2-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
; VSCALEFORTUNING2-NEXT: [[TMP51:%.*]] = call i32 @llvm.vscale.i32()
; VSCALEFORTUNING2-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], 4
; VSCALEFORTUNING2-NEXT: [[TMP53:%.*]] = sub i32 [[TMP52]], 1
; VSCALEFORTUNING2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP26]], i32 [[TMP53]]
; VSCALEFORTUNING2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; VSCALEFORTUNING2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; VSCALEFORTUNING2: scalar.ph:
; VSCALEFORTUNING2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VSCALEFORTUNING2-NEXT: [[SCALAR_RECUR_INIT11:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; VSCALEFORTUNING2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; VSCALEFORTUNING2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; VSCALEFORTUNING2-NEXT: br label [[LOOP:%.*]]
; VSCALEFORTUNING2: loop:
; VSCALEFORTUNING2-NEXT: [[TMP54:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP57:%.*]], [[LOOP]] ]
; VSCALEFORTUNING2-NEXT: [[TMP55:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT11]], [[SCALAR_PH]] ], [ [[TMP54]], [[LOOP]] ]
; VSCALEFORTUNING2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; VSCALEFORTUNING2-NEXT: [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_2:%.*]], [[LOOP]] ]
; VSCALEFORTUNING2-NEXT: [[TMP56:%.*]] = add i64 [[Y]], 1
; VSCALEFORTUNING2-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP56]]
; VSCALEFORTUNING2-NEXT: [[TMP57]] = load i32, ptr [[GEP_1]], align 4
; VSCALEFORTUNING2-NEXT: [[OR3:%.*]] = or i32 [[TMP55]], [[X]]
; VSCALEFORTUNING2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; VSCALEFORTUNING2-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 1
; VSCALEFORTUNING2-NEXT: [[TMP58:%.*]] = shl i32 [[OR3]], 1
; VSCALEFORTUNING2-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 2
; VSCALEFORTUNING2-NEXT: [[SHL19:%.*]] = shl i32 [[X]], 1
; VSCALEFORTUNING2-NEXT: [[TMP60:%.*]] = or i32 [[SHR]], [[SHL19]]
; VSCALEFORTUNING2-NEXT: [[TMP61:%.*]] = or i32 [[TMP60]], [[TMP59]]
; VSCALEFORTUNING2-NEXT: [[TMP62:%.*]] = or i32 [[TMP61]], [[X]]
; VSCALEFORTUNING2-NEXT: [[OR20:%.*]] = or i32 [[Z]], [[X]]
; VSCALEFORTUNING2-NEXT: [[NOT:%.*]] = and i32 [[OR20]], 1
; VSCALEFORTUNING2-NEXT: [[AND:%.*]] = xor i32 [[NOT]], 1
; VSCALEFORTUNING2-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
; VSCALEFORTUNING2-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
; VSCALEFORTUNING2-NEXT: [[TMP63:%.*]] = load i32, ptr [[GEP_2]], align 4
; VSCALEFORTUNING2-NEXT: [[SHR24:%.*]] = lshr i32 [[TMP62]], 1
; VSCALEFORTUNING2-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
; VSCALEFORTUNING2-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
; VSCALEFORTUNING2-NEXT: [[TMP64:%.*]] = load i32, ptr [[GEP_3]], align 4
; VSCALEFORTUNING2-NEXT: [[RED_1:%.*]] = or i32 [[TMP63]], [[SUM_RED]]
; VSCALEFORTUNING2-NEXT: [[RED_2]] = or i32 [[RED_1]], [[TMP64]]
; VSCALEFORTUNING2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[Y]]
; VSCALEFORTUNING2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; VSCALEFORTUNING2: exit:
; VSCALEFORTUNING2-NEXT: [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP]] ], [ [[TMP50]], [[MIDDLE_BLOCK]] ]
; VSCALEFORTUNING2-NEXT: ret i32 [[RED_2_LCSSA]]
;
; PRED-LABEL: define i32 @chained_recurrences(
; PRED-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
; PRED-NEXT: entry:
; PRED-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; PRED: vector.ph:
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], 1
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP3]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]]
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; PRED-NEXT: [[TMP8:%.*]] = sub i64 [[TMP0]], [[TMP7]]
; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[TMP0]], [[TMP7]]
; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[Y]], 1
; PRED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP11]]
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP13:%.*]] = lshr <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; PRED-NEXT: [[TMP14:%.*]] = shl <vscale x 4 x i32> [[BROADCAST_SPLAT]], splat (i32 1)
; PRED-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i32> [[TMP13]], [[TMP14]]
; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP16:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[TMP17:%.*]] = and <vscale x 4 x i32> [[TMP16]], splat (i32 1)
; PRED-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i32> [[TMP17]], splat (i32 1)
; PRED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i32> [[TMP18]] to <vscale x 4 x i64>
; PRED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i64> [[TMP19]], i32 0
; PRED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[TMP20]]
; PRED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP21]], i64 0
; PRED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[DOTSPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
; PRED-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 4
; PRED-NEXT: [[TMP24:%.*]] = sub i32 [[TMP23]], 1
; PRED-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP24]]
; PRED-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32()
; PRED-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 4
; PRED-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1
; PRED-NEXT: [[VECTOR_RECUR_INIT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 0, i32 [[TMP27]]
; PRED-NEXT: br label [[LOOP:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[LOOP]] ]
; PRED-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT]], [[ENTRY]] ], [ [[BROADCAST_SPLAT6:%.*]], [[LOOP]] ]
; PRED-NEXT: [[VECTOR_RECUR4:%.*]] = phi <vscale x 4 x i32> [ [[VECTOR_RECUR_INIT3]], [[ENTRY]] ], [ [[TMP29:%.*]], [[LOOP]] ]
; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP41:%.*]], [[LOOP]] ]
; PRED-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP12]], align 4
; PRED-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP28]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT6]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; PRED-NEXT: [[TMP29]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR]], <vscale x 4 x i32> [[BROADCAST_SPLAT6]], i32 -1)
; PRED-NEXT: [[TMP30:%.*]] = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> [[VECTOR_RECUR4]], <vscale x 4 x i32> [[TMP29]], i32 -1)
; PRED-NEXT: [[TMP31:%.*]] = or <vscale x 4 x i32> [[TMP30]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[TMP32:%.*]] = shl <vscale x 4 x i32> [[TMP31]], splat (i32 1)
; PRED-NEXT: [[TMP33:%.*]] = or <vscale x 4 x i32> [[TMP32]], splat (i32 2)
; PRED-NEXT: [[TMP34:%.*]] = or <vscale x 4 x i32> [[TMP15]], [[TMP33]]
; PRED-NEXT: [[TMP35:%.*]] = or <vscale x 4 x i32> [[TMP34]], [[BROADCAST_SPLAT]]
; PRED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[DOTSPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; PRED-NEXT: [[TMP36:%.*]] = lshr <vscale x 4 x i32> [[TMP35]], splat (i32 1)
; PRED-NEXT: [[TMP37:%.*]] = zext <vscale x 4 x i32> [[TMP36]] to <vscale x 4 x i64>
; PRED-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_2]], <vscale x 4 x i64> [[TMP37]]
; PRED-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP38]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; PRED-NEXT: [[TMP39:%.*]] = or <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[VEC_PHI]]
; PRED-NEXT: [[TMP40:%.*]] = or <vscale x 4 x i32> [[TMP39]], [[WIDE_MASKED_GATHER7]]
; PRED-NEXT: [[TMP41]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI]]
; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[IV]], i64 [[TMP10]])
; PRED-NEXT: [[TMP42:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; PRED-NEXT: [[TMP43:%.*]] = extractelement <vscale x 4 x i1> [[TMP42]], i32 0
; PRED-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP41]])
; PRED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32()
; PRED-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 4
; PRED-NEXT: [[TMP47:%.*]] = sub i32 [[TMP46]], 1
; PRED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP29]], i32 [[TMP47]]
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; PRED: scalar.ph:
; PRED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; PRED-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP44]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1]] ]
; PRED-NEXT: br label [[LOOP1:%.*]]
; PRED: loop:
; PRED-NEXT: [[TMP48:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP53:%.*]], [[LOOP1]] ]
; PRED-NEXT: [[SCALAR_RECUR10:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], [[SCALAR_PH]] ], [ [[TMP48]], [[LOOP1]] ]
; PRED-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[LOOP1]] ]
; PRED-NEXT: [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_2:%.*]], [[LOOP1]] ]
; PRED-NEXT: [[TMP52:%.*]] = add i64 [[Y]], 1
; PRED-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[SRC_1]], i64 [[TMP52]]
; PRED-NEXT: [[TMP53]] = load i32, ptr [[GEP_1]], align 4
; PRED-NEXT: [[OR3:%.*]] = or i32 [[SCALAR_RECUR10]], [[X]]
; PRED-NEXT: [[IV_NEXT1]] = add i64 [[IV1]], 1
; PRED-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 1
; PRED-NEXT: [[TMP54:%.*]] = shl i32 [[OR3]], 1
; PRED-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 2
; PRED-NEXT: [[SHL19:%.*]] = shl i32 [[X]], 1
; PRED-NEXT: [[TMP56:%.*]] = or i32 [[SHR]], [[SHL19]]
; PRED-NEXT: [[TMP57:%.*]] = or i32 [[TMP56]], [[TMP55]]
; PRED-NEXT: [[TMP58:%.*]] = or i32 [[TMP57]], [[X]]
; PRED-NEXT: [[OR20:%.*]] = or i32 [[Z]], [[X]]
; PRED-NEXT: [[NOT:%.*]] = and i32 [[OR20]], 1
; PRED-NEXT: [[AND:%.*]] = xor i32 [[NOT]], 1
; PRED-NEXT: [[IDX_EXT_1:%.*]] = zext i32 [[AND]] to i64
; PRED-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_1]]
; PRED-NEXT: [[TMP59:%.*]] = load i32, ptr [[GEP_2]], align 4
; PRED-NEXT: [[SHR24:%.*]] = lshr i32 [[TMP58]], 1
; PRED-NEXT: [[IDX_EXT_2:%.*]] = zext i32 [[SHR24]] to i64
; PRED-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[SRC_2]], i64 [[IDX_EXT_2]]
; PRED-NEXT: [[TMP60:%.*]] = load i32, ptr [[GEP_3]], align 4
; PRED-NEXT: [[RED_1:%.*]] = or i32 [[TMP59]], [[SUM_RED]]
; PRED-NEXT: [[RED_2]] = or i32 [[RED_1]], [[TMP60]]
; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[Y]]
; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]]
; PRED: exit:
; PRED-NEXT: [[RED_2_LCSSA:%.*]] = phi i32 [ [[RED_2]], [[LOOP1]] ], [ [[TMP44]], [[MIDDLE_BLOCK]] ]
; PRED-NEXT: ret i32 [[RED_2_LCSSA]]
;
entry:
  br label %loop

loop:
  ; %2 is fed by this iteration's load (%5) on the back edge; %3 is fed by %2.
  %2 = phi i32 [ 0, %entry ], [ %5, %loop ]
  %3 = phi i32 [ 0, %entry ], [ %2, %loop ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %sum.red = phi i32 [ 0, %entry ], [ %red.2, %loop ]
  ; The address below is built only from %y and %src.1, not from %iv.
  %4 = add i64 %y, 1
  %gep.1 = getelementptr i32, ptr %src.1, i64 %4
  %5 = load i32, ptr %gep.1, align 4
  %or3 = or i32 %3, %x
  %iv.next = add i64 %iv, 1
  %shr = lshr i32 %x, 1
  %6 = shl i32 %or3, 1
  %7 = or i32 %6, 2
  %shl19 = shl i32 %x, 1
  %8 = or i32 %shr, %shl19
  %9 = or i32 %8, %7
  %10 = or i32 %9, %x
  ; Index of the first %src.2 load is computed from %z and %x only.
  %or20 = or i32 %z, %x
  %not = and i32 %or20, 1
  %and = xor i32 %not, 1
  %idx.ext.1 = zext i32 %and to i64
  %gep.2 = getelementptr i32, ptr %src.2, i64 %idx.ext.1
  %11 = load i32, ptr %gep.2, align 4
  ; Index of the second %src.2 load is derived from %10, which depends on %3.
  %shr24 = lshr i32 %10, 1
  %idx.ext.2 = zext i32 %shr24 to i64
  %gep.3 = getelementptr i32, ptr %src.2, i64 %idx.ext.2
  %12 = load i32, ptr %gep.3, align 4
  ; Fold both loaded values into the running `or` reduction.
  %red.1 = or i32 %11, %sum.red
  %red.2 = or i32 %red.1, %12
  ; Exit test uses the pre-increment induction value.
  %ec = icmp eq i64 %iv, %y
  br i1 %ec, label %exit, label %loop

exit:
  ret i32 %red.2
}
341 342define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { 343; DEFAULT-LABEL: define i16 @reduce_udiv( 344; DEFAULT-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { 345; DEFAULT-NEXT: entry: 346; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 347; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() 348; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 349; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] 350; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 351; DEFAULT: vector.ph: 352; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() 353; DEFAULT-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 354; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] 355; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] 356; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() 357; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 358; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0 359; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer 360; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] 361; DEFAULT: vector.body: 362; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 363; DEFAULT-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] 364; DEFAULT-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP22:%.*]], [[VECTOR_BODY]] ] 365; DEFAULT-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 366; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP7]] 367; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP13]], i32 0 368; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() 369; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 370; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[TMP13]], i64 [[TMP17]] 371; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP15]], align 2 372; DEFAULT-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP18]], align 2 373; DEFAULT-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] 374; DEFAULT-NEXT: [[TMP20:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]] 375; DEFAULT-NEXT: [[TMP21]] = or <vscale x 4 x i16> [[TMP19]], [[VEC_PHI]] 376; DEFAULT-NEXT: [[TMP22]] = or <vscale x 4 x i16> [[TMP20]], [[VEC_PHI1]] 377; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] 378; DEFAULT-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 379; DEFAULT-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] 380; DEFAULT: middle.block: 381; DEFAULT-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i16> [[TMP22]], [[TMP21]] 382; DEFAULT-NEXT: [[TMP24:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[BIN_RDX]]) 383; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] 384; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] 385; DEFAULT: scalar.ph: 386; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 387; DEFAULT-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] 388; DEFAULT-NEXT: br label [[LOOP:%.*]] 389; DEFAULT: loop: 390; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] 391; DEFAULT-NEXT: 
[[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] 392; DEFAULT-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]] 393; DEFAULT-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2 394; DEFAULT-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]] 395; DEFAULT-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]] 396; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 397; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] 398; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] 399; DEFAULT: exit: 400; DEFAULT-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] 401; DEFAULT-NEXT: ret i16 [[RED_NEXT_LCSSA]] 402; 403; VSCALEFORTUNING2-LABEL: define i16 @reduce_udiv( 404; VSCALEFORTUNING2-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { 405; VSCALEFORTUNING2-NEXT: entry: 406; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 407; VSCALEFORTUNING2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() 408; VSCALEFORTUNING2-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 8 409; VSCALEFORTUNING2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] 410; VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 411; VSCALEFORTUNING2: vector.ph: 412; VSCALEFORTUNING2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() 413; VSCALEFORTUNING2-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 414; VSCALEFORTUNING2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] 415; VSCALEFORTUNING2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] 416; VSCALEFORTUNING2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() 417; VSCALEFORTUNING2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 418; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0 419; VSCALEFORTUNING2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, 
<vscale x 4 x i32> zeroinitializer 420; VSCALEFORTUNING2-NEXT: br label [[VECTOR_BODY:%.*]] 421; VSCALEFORTUNING2: vector.body: 422; VSCALEFORTUNING2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 423; VSCALEFORTUNING2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] 424; VSCALEFORTUNING2-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] 425; VSCALEFORTUNING2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 426; VSCALEFORTUNING2-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP7]] 427; VSCALEFORTUNING2-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0 428; VSCALEFORTUNING2-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() 429; VSCALEFORTUNING2-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 430; VSCALEFORTUNING2-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i64 [[TMP11]] 431; VSCALEFORTUNING2-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP9]], align 2 432; VSCALEFORTUNING2-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, ptr [[TMP12]], align 2 433; VSCALEFORTUNING2-NEXT: [[TMP13:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] 434; VSCALEFORTUNING2-NEXT: [[TMP14:%.*]] = udiv <vscale x 4 x i16> [[WIDE_LOAD2]], [[BROADCAST_SPLAT]] 435; VSCALEFORTUNING2-NEXT: [[TMP15]] = or <vscale x 4 x i16> [[TMP13]], [[VEC_PHI]] 436; VSCALEFORTUNING2-NEXT: [[TMP16]] = or <vscale x 4 x i16> [[TMP14]], [[VEC_PHI1]] 437; VSCALEFORTUNING2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] 438; VSCALEFORTUNING2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 439; VSCALEFORTUNING2-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] 440; VSCALEFORTUNING2: middle.block: 441; VSCALEFORTUNING2-NEXT: [[BIN_RDX:%.*]] = or <vscale x 4 x i16> [[TMP16]], [[TMP15]] 442; VSCALEFORTUNING2-NEXT: 
[[TMP18:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[BIN_RDX]]) 443; VSCALEFORTUNING2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] 444; VSCALEFORTUNING2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] 445; VSCALEFORTUNING2: scalar.ph: 446; VSCALEFORTUNING2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 447; VSCALEFORTUNING2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] 448; VSCALEFORTUNING2-NEXT: br label [[LOOP:%.*]] 449; VSCALEFORTUNING2: loop: 450; VSCALEFORTUNING2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] 451; VSCALEFORTUNING2-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] 452; VSCALEFORTUNING2-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]] 453; VSCALEFORTUNING2-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2 454; VSCALEFORTUNING2-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]] 455; VSCALEFORTUNING2-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]] 456; VSCALEFORTUNING2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 457; VSCALEFORTUNING2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] 458; VSCALEFORTUNING2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] 459; VSCALEFORTUNING2: exit: 460; VSCALEFORTUNING2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] 461; VSCALEFORTUNING2-NEXT: ret i16 [[RED_NEXT_LCSSA]] 462; 463; PRED-LABEL: define i16 @reduce_udiv( 464; PRED-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { 465; PRED-NEXT: entry: 466; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 467; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] 468; PRED: vector.ph: 469; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() 470; PRED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 471; PRED-NEXT: [[TMP5:%.*]] = sub i64 [[TMP2]], 1 
472; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP5]] 473; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP2]] 474; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] 475; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() 476; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 477; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() 478; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 479; PRED-NEXT: [[TMP10:%.*]] = sub i64 [[TMP0]], [[TMP9]] 480; PRED-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], [[TMP9]] 481; PRED-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 482; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]]) 483; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[X]], i64 0 484; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer 485; PRED-NEXT: br label [[VECTOR_BODY:%.*]] 486; PRED: vector.body: 487; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 488; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] 489; PRED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] 490; PRED-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 491; PRED-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP13]] 492; PRED-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0 493; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16.p0(ptr [[TMP15]], i32 2, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> poison) 494; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i16> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] 495; PRED-NEXT: [[TMP20:%.*]] = or 
<vscale x 4 x i16> [[TMP19]], [[VEC_PHI]] 496; PRED-NEXT: [[TMP16]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i16> [[TMP20]], <vscale x 4 x i16> [[VEC_PHI]] 497; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] 498; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]]) 499; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true) 500; PRED-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0 501; PRED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] 502; PRED: middle.block: 503; PRED-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.nxv4i16(<vscale x 4 x i16> [[TMP16]]) 504; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] 505; PRED: scalar.ph: 506; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] 507; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] 508; PRED-NEXT: br label [[LOOP:%.*]] 509; PRED: loop: 510; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] 511; PRED-NEXT: [[RED:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] 512; PRED-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]] 513; PRED-NEXT: [[L:%.*]] = load i16, ptr [[GEP]], align 2 514; PRED-NEXT: [[DIV:%.*]] = udiv i16 [[L]], [[X]] 515; PRED-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]] 516; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 517; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] 518; PRED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] 519; PRED: exit: 520; PRED-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] 521; PRED-NEXT: ret i16 [[RED_NEXT_LCSSA]] 522; 523entry: 524 br label %loop 525 526loop: 
; Scalar input loop of @reduce_udiv: each iteration loads the i16 at %src[%iv], divides it (unsigned) by %x and ORs the quotient into the running value %red. The exit test compares the pre-increment %iv with %N, and the exit block returns %red.next.
527 %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] 528 %red = phi i16 [ 0, %entry ], [ %red.next, %loop ] 529 %gep = getelementptr i16, ptr %src, i64 %iv 530 %l = load i16, ptr %gep, align 2 531 %div = udiv i16 %l, %x 532 %red.next = or i16 %div, %red 533 %iv.next = add i64 %iv, 1 534 %ec = icmp eq i64 %iv, %N 535 br i1 %ec, label %exit, label %loop 536 537exit: 538 ret i16 %red.next 539} 540 541attributes #0 = { "target-features"="+sve" } 542;. 543; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} 544; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} 545; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} 546; DEFAULT: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} 547;. 548; VSCALEFORTUNING2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} 549; VSCALEFORTUNING2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} 550; VSCALEFORTUNING2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} 551; VSCALEFORTUNING2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} 552; VSCALEFORTUNING2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} 553; VSCALEFORTUNING2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} 554;. 555; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} 556; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} 557; PRED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} 558; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} 559; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} 560; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} 561;. 562