; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-enable-mgather-combine=0 -enable-misched=false < %s | FileCheck %s
; RUN: llc -aarch64-enable-mgather-combine=1 -enable-misched=false < %s | FileCheck %s

target triple = "aarch64-linux-gnu"

; Test for multiple uses of the mgather where the s/zext should not be combined

define <vscale x 2 x i64> @masked_sgather_sext(ptr %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) #0 {
; CHECK-LABEL: masked_sgather_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    sxtb z2.d, p0/m, z0.d
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT:    ret
  %ptrs = getelementptr i8, ptr %base, <vscale x 2 x i64> %offsets
  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x ptr> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %data.sext = sext <vscale x 2 x i8> %data to <vscale x 2 x i64>
  %add = add <vscale x 2 x i8> %data, %vals
  %add.sext = sext <vscale x 2 x i8> %add to <vscale x 2 x i64>
  %mul = mul <vscale x 2 x i64> %data.sext, %add.sext
  ret <vscale x 2 x i64> %mul
}

define <vscale x 2 x i64> @masked_sgather_zext(ptr %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) #0 {
; CHECK-LABEL: masked_sgather_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    add z1.d, z0.d, z1.d
; CHECK-NEXT:    and z0.d, z0.d, #0xff
; CHECK-NEXT:    and z1.d, z1.d, #0xff
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    ret
  %ptrs = getelementptr i8, ptr %base, <vscale x 2 x i64> %offsets
  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x ptr> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %data.zext = zext <vscale x 2 x i8> %data to <vscale x 2 x i64>
  %add = add <vscale x 2 x i8> %data, %vals
  %add.zext = zext <vscale x 2 x i8> %add to <vscale x 2 x i64>
  %mul = mul <vscale x 2 x i64> %data.zext, %add.zext
  ret <vscale x 2 x i64> %mul
}

; Tests that exercise various type legalisation scenarios for ISD::MGATHER.

; Code generate load of an illegal datatype via promotion.
define <vscale x 2 x i8> @masked_gather_nxv2i8(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    ret
  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x ptr> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  ret <vscale x 2 x i8> %data
}

; Code generate load of an illegal datatype via promotion.
define <vscale x 2 x i16> @masked_gather_nxv2i16(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    ret
  %data = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x ptr> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  ret <vscale x 2 x i16> %data
}

; Code generate load of an illegal datatype via promotion.
define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    ret
  %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x ptr> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  ret <vscale x 2 x i32> %data
}

define <vscale x 4 x half> @masked_gather_nxv4f16(<vscale x 4 x ptr> %ptrs, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %data = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x ptr> %ptrs, i32 0, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  ret <vscale x 4 x half> %data
}

define <vscale x 2 x float> @masked_gather_nxv2f32(ptr %base, <vscale x 2 x i16> %indices, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    sxth z0.d, p1/m, z0.d
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
; CHECK-NEXT:    ret
  %ptrs = getelementptr float, ptr %base, <vscale x 2 x i16> %indices
  %data = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x ptr> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  ret <vscale x 2 x float> %data
}

define <vscale x 8 x half> @masked_gather_nxv8f16(<vscale x 8 x ptr> %ptrs, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p2.h, p1.b
; CHECK-NEXT:    punpklo p1.h, p1.b
; CHECK-NEXT:    ld1h { z3.d }, p2/z, [z3.d]
; CHECK-NEXT:    ld1h { z2.d }, p1/z, [z2.d]
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT:    ret
  %data = call <vscale x 8 x half> @llvm.masked.gather.nxv8f16(<vscale x 8 x ptr> %ptrs, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  ret <vscale x 8 x half> %data
}

define <vscale x 8 x bfloat> @masked_gather_nxv8bf16(ptr %base, <vscale x 8 x i16> %indices, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sunpkhi z1.s, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1]
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %ptrs = getelementptr bfloat, ptr %base, <vscale x 8 x i16> %indices
  %data = call <vscale x 8 x bfloat> @llvm.masked.gather.nxv8bf16(<vscale x 8 x ptr> %ptrs, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  ret <vscale x 8 x bfloat> %data
}

define <vscale x 4 x double> @masked_gather_nxv4f64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    sxth z1.s, p1/m, z0.s
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    sunpklo z0.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0, z0.d, lsl #3]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, z1.d, lsl #3]
; CHECK-NEXT:    ret
  %ptrs = getelementptr double, ptr %base, <vscale x 4 x i16> %indices
  %data = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x ptr> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x double> undef)
  ret <vscale x 4 x double> %data
}

define <vscale x 8 x float> @masked_gather_nxv8f32(ptr %base, <vscale x 8 x i32> %offsets, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, z1.s, uxtw #2]
; CHECK-NEXT:    ret
  %offsets.zext = zext <vscale x 8 x i32> %offsets to <vscale x 8 x i64>
  %ptrs = getelementptr float, ptr %base, <vscale x 8 x i64> %offsets.zext
  %vals = call <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x ptr> %ptrs, i32 4, <vscale x 8 x i1> %mask, <vscale x 8 x float> undef)
  ret <vscale x 8 x float> %vals
}

; Code generate the worst case scenario when all vector types are legal.
define <vscale x 16 x i8> @masked_gather_nxv16i8(ptr %base, <vscale x 16 x i8> %indices, <vscale x 16 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sunpkhi z1.h, z0.b
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p2.h, p1.b
; CHECK-NEXT:    punpklo p1.h, p1.b
; CHECK-NEXT:    sunpkhi z2.s, z1.h
; CHECK-NEXT:    sunpklo z1.s, z1.h
; CHECK-NEXT:    ld1b { z2.s }, p2/z, [x0, z2.s, sxtw]
; CHECK-NEXT:    ld1b { z1.s }, p1/z, [x0, z1.s, sxtw]
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    uzp1 z1.h, z1.h, z2.h
; CHECK-NEXT:    sunpkhi z2.s, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    ld1b { z2.s }, p1/z, [x0, z2.s, sxtw]
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
; CHECK-NEXT:    ret
  %ptrs = getelementptr i8, ptr %base, <vscale x 16 x i8> %indices
  %data = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x ptr> %ptrs, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  ret <vscale x 16 x i8> %data
}

; Code generate the worst case scenario when all vector types are illegal.
define <vscale x 32 x i32> @masked_gather_nxv32i32(ptr %base, <vscale x 32 x i32> %indices, <vscale x 32 x i1> %mask) #0 {
; CHECK-LABEL: masked_gather_nxv32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    ld1w { z0.s }, p3/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    ld1w { z1.s }, p2/z, [x0, z1.s, sxtw #2]
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    ld1w { z2.s }, p2/z, [x0, z2.s, sxtw #2]
; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x0, z3.s, sxtw #2]
; CHECK-NEXT:    punpklo p0.h, p1.b
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    ld1w { z4.s }, p2/z, [x0, z4.s, sxtw #2]
; CHECK-NEXT:    ld1w { z5.s }, p0/z, [x0, z5.s, sxtw #2]
; CHECK-NEXT:    punpkhi p0.h, p1.b
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    ld1w { z6.s }, p1/z, [x0, z6.s, sxtw #2]
; CHECK-NEXT:    ld1w { z7.s }, p0/z, [x0, z7.s, sxtw #2]
; CHECK-NEXT:    ret
  %ptrs = getelementptr i32, ptr %base, <vscale x 32 x i32> %indices
  %data = call <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x ptr> %ptrs, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i32> undef)
  ret <vscale x 32 x i32> %data
}

; TODO: Currently, the sign extend gets applied to the values after a 'uzp1' of two
; registers, so it doesn't get folded away. Same for any other vector-of-pointers
; style gathers which don't fit in a single <vscale x 2 x ptr> register. Better folding
; is required before we can check those off.
define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x ptr> %ptrs, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: masked_sgather_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1b { z1.d }, p1/z, [z1.d]
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
; CHECK-NEXT:    ret
  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x ptr> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %svals = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %svals
}

attributes #0 = { nounwind "target-features"="+sve,+bf16" }

declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x ptr>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x ptr>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)

declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 8 x half> @llvm.masked.gather.nxv8f16(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.gather.nxv8bf16(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 8 x float> @llvm.masked.gather.nxv8f32(<vscale x 8 x ptr>, i32, <vscale x 8 x i1>, <vscale x 8 x float>)
declare <vscale x 4 x double> @llvm.masked.gather.nxv4f64(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>, <vscale x 4 x double>)