; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; Masked Stores
;

define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    ldr s2, [x1]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    fcmeq v2.4h, v1.4h, v2.4h
; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-NEXT:    mov v0.h[0], v2.h[0]
; CHECK-NEXT:    mov w8, v2.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    st1h { z1.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <2 x half>, ptr %ap
  %b = load <2 x half>, ptr %bp
  %mask = fcmp oeq <2 x half> %a, %b
  call void @llvm.masked.store.v2f16(<2 x half> %a, ptr %bp, i32 8, <2 x i1> %mask)
  ret void
}

define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    fcmeq v1.2s, v0.2s, v1.2s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %mask = fcmp oeq <2 x float> %a, %b
  call void @llvm.masked.store.v2f32(<2 x float> %a, ptr %bp, i32 8, <2 x i1> %mask)
  ret void
}

define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_store_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    fcmeq v1.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %mask = fcmp oeq <4 x float> %a, %b
  call void @llvm.masked.store.v4f32(<4 x float> %a, ptr %bp, i32 8, <4 x i1> %mask)
  ret void
}

define void @masked_store_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_store_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %mask = fcmp oeq <8 x float> %a, %b
  call void @llvm.masked.store.v8f32(<8 x float> %a, ptr %bp, i32 8, <8 x i1> %mask)
  ret void
}
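
; Note: v16f32 is the first case below that no longer fits in a single
; 256-bit SVE register, so the VBITS_GE_256 and VBITS_GE_512 check lines
; diverge from here on: the 256-bit lowering splits the operation into two
; vl8 halves (offset by x8), while the 512-bit lowering uses one vl16
; predicate.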
define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 {
; VBITS_GE_256-LABEL: masked_store_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p0.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p1, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z2.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %mask = fcmp oeq <16 x float> %a, %b
  call void @llvm.masked.store.v16f32(<16 x float> %a, ptr %ap, i32 8, <16 x i1> %mask)
  ret void
}

define void @masked_store_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_store_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %mask = fcmp oeq <32 x float> %a, %b
  call void @llvm.masked.store.v32f32(<32 x float> %a, ptr %ap, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_store_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_store_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %mask = fcmp oeq <64 x float> %a, %b
  call void @llvm.masked.store.v64f32(<64 x float> %a, ptr %ap, i32 8, <64 x i1> %mask)
  ret void
}
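
;
; Masked Truncating Stores
;
; Note: each test below compares two vectors and stores the data truncated to
; a narrower element type. On the VBITS_GE_512 lines the truncate folds into
; a single narrowing store (e.g. st1b { z0.d } writes the low byte of each
; doubleword element); the VBITS_GE_256 lines instead narrow both halves of
; the data and the mask explicitly before a full-width store.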
define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z3.s, #0
; VBITS_GE_256-NEXT:    st1b { z1.s }, p1, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1b { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %val = trunc <8 x i64> %a to <8 x i8>
  call void @llvm.masked.store.v8i8(<8 x i8> %val, ptr %dest, i32 8, <8 x i1> %mask)
  ret void
}

define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z2.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z2.h, #0
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %val = trunc <8 x i64> %a to <8 x i16>
  call void @llvm.masked.store.v8i16(<8 x i16> %val, ptr %dest, i32 8, <8 x i1> %mask)
  ret void
}

define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v8i64i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z2.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    cmpeq p0.d, p0/z, z1.d, z3.d
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    mov z2.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    mov z3.d, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z2.s, z2.s, z2.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z3.s, z3.s, z3.s
; VBITS_GE_256-NEXT:    splice z3.s, p0, z3.s, z2.s
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z3.s, #0
; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %val = trunc <8 x i64> %a to <8 x i32>
  call void @llvm.masked.store.v8i32(<8 x i32> %val, ptr %dest, i32 8, <8 x i1> %mask)
  ret void
}
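
; Note: in the VBITS_GE_256 blocks, the compare's predicate result cannot be
; narrowed directly, so the mask is first expanded into a vector of -1/0
; elements (mov z, p/z, #-1), narrowed alongside the data with uzp1/splice,
; and then turned back into a predicate with cmpne before the store.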
define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z3.b, #0
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1b { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %val = trunc <16 x i32> %a to <16 x i8>
  call void @llvm.masked.store.v16i8(<16 x i8> %val, ptr %dest, i32 8, <16 x i1> %mask)
  ret void
}

define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v16i32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z2.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z1.s, z3.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov z2.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    mov z3.s, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    splice z1.h, p1, z1.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z2.h, z2.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z3.h, z3.h, z3.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    mov v3.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    sunpklo z2.h, z3.b
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z2.h, #0
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %val = trunc <16 x i32> %a to <16 x i16>
  call void @llvm.masked.store.v16i16(<16 x i16> %val, ptr %dest, i32 8, <16 x i1> %mask)
  ret void
}

define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 {
; VBITS_GE_256-LABEL: masked_store_trunc_v32i16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z2.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z1.h, z3.h
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov z2.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    mov z3.h, p0/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT:    splice z3.b, p0, z3.b, z2.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z3.b, #0
; VBITS_GE_256-NEXT:    st1b { z1.b }, p1, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1b { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %ap
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %a, %b
  %val = trunc <32 x i16> %a to <32 x i8>
  call void @llvm.masked.store.v32i8(<32 x i8> %val, ptr %dest, i32 8, <32 x i1> %mask)
  ret void
}
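
; Note: the third (i32) argument of llvm.masked.store is the alignment of
; the store; every call in this file uses an alignment of 8 bytes.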
declare void @llvm.masked.store.v2f16(<2 x half>, ptr, i32, <2 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, ptr, i32, <2 x i1>)
declare void @llvm.masked.store.v4f32(<4 x float>, ptr, i32, <4 x i1>)
declare void @llvm.masked.store.v8f32(<8 x float>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v32f32(<32 x float>, ptr, i32, <32 x i1>)
declare void @llvm.masked.store.v64f32(<64 x float>, ptr, i32, <64 x i1>)

declare void @llvm.masked.store.v8i8(<8 x i8>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v8i16(<8 x i16>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v16i8(<16 x i8>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v16i16(<16 x i16>, ptr, i32, <16 x i1>)
declare void @llvm.masked.store.v32i8(<32 x i8>, ptr, i32, <32 x i1>)

attributes #0 = { "target-features"="+sve" }