; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

; Fold zeroing of inactive lanes into the load's passthrough parameter.
define <4 x float> @masked_load_and_zero_inactive_1(ptr %ptr, <4 x i1> %mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_1(
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x float> zeroinitializer)
; CHECK-NEXT:    ret <4 x float> [[LOAD]]
;
  %load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x float> undef)
  %masked = select <4 x i1> %mask, <4 x float> %load, <4 x float> zeroinitializer
  ret <4 x float> %masked
}

; As above but reuse the load's existing passthrough.
define <4 x i32> @masked_load_and_zero_inactive_2(ptr %ptr, <4 x i1> %mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_2(
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> zeroinitializer)
; CHECK-NEXT:    ret <4 x i32> [[LOAD]]
;
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
  %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
  ret <4 x i32> %masked
}

; No transform when the load's passthrough cannot be reused or altered.
define <4 x i32> @masked_load_and_zero_inactive_3(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthrough) {
; CHECK-LABEL: @masked_load_and_zero_inactive_3(
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHROUGH:%.*]])
; CHECK-NEXT:    [[MASKED:%.*]] = select <4 x i1> [[MASK]], <4 x i32> [[LOAD]], <4 x i32> zeroinitializer
; CHECK-NEXT:    ret <4 x i32> [[MASKED]]
;
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
  %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
  ret <4 x i32> %masked
}

; Remove redundant select when its mask doesn't overlap with the load mask.
define <4 x i32> @masked_load_and_zero_inactive_4(ptr %ptr, <4 x i1> %inv_mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_4(
; CHECK-NEXT:    [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true)
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK]], <4 x i32> zeroinitializer)
; CHECK-NEXT:    ret <4 x i32> [[LOAD]]
;
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; As above but reuse the load's existing passthrough.
define <4 x i32> @masked_load_and_zero_inactive_5(ptr %ptr, <4 x i1> %inv_mask) {
; CHECK-LABEL: @masked_load_and_zero_inactive_5(
; CHECK-NEXT:    [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true)
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK]], <4 x i32> zeroinitializer)
; CHECK-NEXT:    ret <4 x i32> [[LOAD]]
;
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; No transform when the load's passthrough cannot be reused or altered.
define <4 x i32> @masked_load_and_zero_inactive_6(ptr %ptr, <4 x i1> %inv_mask, <4 x i32> %passthrough) {
; CHECK-LABEL: @masked_load_and_zero_inactive_6(
; CHECK-NEXT:    [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true)
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK]], <4 x i32> [[PASSTHROUGH:%.*]])
; CHECK-NEXT:    [[MASKED:%.*]] = select <4 x i1> [[INV_MASK]], <4 x i32> zeroinitializer, <4 x i32> [[LOAD]]
; CHECK-NEXT:    ret <4 x i32> [[MASKED]]
;
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; No transform when select and load masks have no relation.
define <4 x i32> @masked_load_and_zero_inactive_7(ptr %ptr, <4 x i1> %mask1, <4 x i1> %mask2) {
; CHECK-LABEL: @masked_load_and_zero_inactive_7(
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK1:%.*]], <4 x i32> zeroinitializer)
; CHECK-NEXT:    [[MASKED:%.*]] = select <4 x i1> [[MASK2:%.*]], <4 x i32> zeroinitializer, <4 x i32> [[LOAD]]
; CHECK-NEXT:    ret <4 x i32> [[MASKED]]
;
  %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask1, <4 x i32> zeroinitializer)
  %masked = select <4 x i1> %mask2, <4 x i32> zeroinitializer, <4 x i32> %load
  ret <4 x i32> %masked
}

; A more complex case where we can prove the select mask is a subset of the
; load's inactive lanes and thus the load's passthrough takes effect.
define <4 x float> @masked_load_and_zero_inactive_8(ptr %ptr, <4 x i1> %inv_mask, <4 x i1> %cond) {
; CHECK-LABEL: @masked_load_and_zero_inactive_8(
; CHECK-NEXT:    [[MASK:%.*]] = xor <4 x i1> [[INV_MASK:%.*]], splat (i1 true)
; CHECK-NEXT:    [[PG:%.*]] = and <4 x i1> [[COND:%.*]], [[MASK]]
; CHECK-NEXT:    [[LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[PG]], <4 x float> zeroinitializer)
; CHECK-NEXT:    ret <4 x float> [[LOAD]]
;
  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
  %pg = and <4 x i1> %mask, %cond
  %load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %ptr, i32 4, <4 x i1> %pg, <4 x float> undef)
  %masked = select <4 x i1> %inv_mask, <4 x float> zeroinitializer, <4 x float> %load
  ret <4 x float> %masked
}

define <8 x float> @masked_load_and_scalar_select_cond(ptr %ptr, <8 x i1> %mask, i1 %cond) {
; CHECK-LABEL: @masked_load_and_scalar_select_cond(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[PTR:%.*]], i32 32, <8 x i1> [[MASK:%.*]], <8 x float> undef)
; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], <8 x float> zeroinitializer, <8 x float> [[TMP0]]
; CHECK-NEXT:    ret <8 x float> [[TMP1]]
;
entry:
  %0 = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %ptr, i32 32, <8 x i1> %mask, <8 x float> undef)
  %1 = select i1 %cond, <8 x float> zeroinitializer, <8 x float> %0
  ret <8 x float> %1
}

declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32 immarg, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)