; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

; These tests should allow scheduling of the loads before the stores.
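; In each positive test the pointer is noalias and the second access begins at
; %l0 + (vscale << k), which equals the store size of the first vector, so the
; two load/store pairs touch disjoint memory. In scalable_v16i8, for example,
; the offset is vscale * 16 bytes, i.e. one whole <vscale x 16 x i8>, which
; lowers to the [x0, #1, mul vl] addressing seen in the checks.
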
define void @scalable_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z0.b
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z1.b
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    st1b { z1.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v8i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z0.h
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z1.h
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 8 x i16>, ptr %l0, align 16
  %l5 = mul <vscale x 8 x i16> %l3, %l3
  %l6 = xor <vscale x 8 x i16> %l5, %l3
  store <vscale x 8 x i16> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 8 x i16>, ptr %l9, align 16
  %l13 = mul <vscale x 8 x i16> %l11, %l11
  %l14 = xor <vscale x 8 x i16> %l13, %l11
  store <vscale x 8 x i16> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i32>, ptr %l0, align 16
  %l5 = mul <vscale x 4 x i32> %l3, %l3
  %l6 = xor <vscale x 4 x i32> %l5, %l3
  store <vscale x 4 x i32> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i32>, ptr %l9, align 16
  %l13 = mul <vscale x 4 x i32> %l11, %l11
  %l14 = xor <vscale x 4 x i32> %l13, %l11
  store <vscale x 4 x i32> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i64(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i64>, ptr %l0, align 16
  %l5 = mul <vscale x 2 x i64> %l3, %l3
  %l6 = xor <vscale x 2 x i64> %l5, %l3
  store <vscale x 2 x i64> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i64>, ptr %l9, align 16
  %l13 = mul <vscale x 2 x i64> %l11, %l11
  %l14 = xor <vscale x 2 x i64> %l13, %l11
  store <vscale x 2 x i64> %l14, ptr %l9, align 16
  ret void
}

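; The tests below use element types narrower than the addressed SVE container,
; so the loads sign-extend and the stores truncate. The offset still matches
; the in-memory footprint of the first vector (e.g. vscale * 8 bytes for
; <vscale x 8 x i8>), keeping the two accesses disjoint.
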
define void @scalable_v8i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z0.h
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.h, p0/m, z3.h, z1.h
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.h }, p0, [x0]
; CHECK-NEXT:    st1b { z1.h }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 8 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 8 x i8> %l3 to <vscale x 8 x i16>
  %l5 = mul <vscale x 8 x i16> %s3, %s3
  %l6 = xor <vscale x 8 x i16> %l5, %s3
  %t6 = trunc <vscale x 8 x i16> %l6 to <vscale x 8 x i8>
  store <vscale x 8 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 8 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 8 x i8> %l11 to <vscale x 8 x i16>
  %l13 = mul <vscale x 8 x i16> %s11, %s11
  %l14 = xor <vscale x 8 x i16> %l13, %s11
  %t14 = trunc <vscale x 8 x i16> %l14 to <vscale x 8 x i8>
  store <vscale x 8 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    st1b { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 4 x i8> %l3 to <vscale x 4 x i32>
  %l5 = mul <vscale x 4 x i32> %s3, %s3
  %l6 = xor <vscale x 4 x i32> %l5, %s3
  %t6 = trunc <vscale x 4 x i32> %l6 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 4 x i8> %l11 to <vscale x 4 x i32>
  %l13 = mul <vscale x 4 x i32> %s11, %s11
  %l14 = xor <vscale x 4 x i32> %l13, %s11
  %t14 = trunc <vscale x 4 x i32> %l14 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
; CHECK-NEXT:    st1b { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i8> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 1
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i8> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 4 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 4 x i16> %l3 to <vscale x 4 x i32>
  %l5 = mul <vscale x 4 x i32> %s3, %s3
  %l6 = xor <vscale x 4 x i32> %l5, %s3
  %t6 = trunc <vscale x 4 x i32> %l6 to <vscale x 4 x i16>
  store <vscale x 4 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 4 x i16> %l11 to <vscale x 4 x i32>
  %l13 = mul <vscale x 4 x i32> %s11, %s11
  %l14 = xor <vscale x 4 x i32> %l13, %s11
  %t14 = trunc <vscale x 4 x i32> %l14 to <vscale x 4 x i16>
  store <vscale x 4 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sh { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x0]
; CHECK-NEXT:    st1h { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i16> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i16> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    movprfx z2, z0
; CHECK-NEXT:    mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT:    movprfx z3, z1
; CHECK-NEXT:    mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT:    eor z0.d, z2.d, z0.d
; CHECK-NEXT:    eor z1.d, z3.d, z1.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
; CHECK-NEXT:    st1w { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i32>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i32> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i32>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i32> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t14, ptr %l9, align 16
  ret void
}

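; Negative test: the offset is only vscale * 8 bytes, half the store size of a
; <vscale x 16 x i8>, so the accesses may overlap and the second load stays
; after the first store.
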
define void @negative_tooshort_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_tooshort_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  ret void
}

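; Negative tests: the offset is smaller than the footprint of the stored vector
; (e.g. vscale * 1 byte against the vscale * 2 bytes of a <vscale x 2 x i8>),
; so the accesses may overlap and each load stays after the preceding store.
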
define void @negative_scalable_v2i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    lsr x8, x8, #4
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i8> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 0
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i8> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @negative_scalable_v2i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x0]
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i16> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 1
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i16> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @negative_scalable_v2i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 2 x i32>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i32> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i32>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i32> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t14, ptr %l9, align 16
  ret void
}

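; Three back-to-back chunks, each one full <vscale x 16 x i8> apart, so all
; three loads can be scheduled ahead of the stores, as the checks show.
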
define void @triple_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: triple_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1b { z2.b }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    movprfx z3, z0
; CHECK-NEXT:    mul z3.b, p0/m, z3.b, z0.b
; CHECK-NEXT:    movprfx z4, z1
; CHECK-NEXT:    mul z4.b, p0/m, z4.b, z1.b
; CHECK-NEXT:    movprfx z5, z2
; CHECK-NEXT:    mul z5.b, p0/m, z5.b, z2.b
; CHECK-NEXT:    eor z0.d, z3.d, z0.d
; CHECK-NEXT:    eor z1.d, z4.d, z1.d
; CHECK-NEXT:    eor z2.d, z5.d, z2.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    st1b { z1.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1b { z2.b }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  %m9 = getelementptr inbounds i8, ptr %l9, i64 %l8
  %m11 = load <vscale x 16 x i8>, ptr %m9, align 16
  %m13 = mul <vscale x 16 x i8> %m11, %m11
  %m14 = xor <vscale x 16 x i8> %m13, %m11
  store <vscale x 16 x i8> %m14, ptr %m9, align 16
  ret void
}

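; Negative test: the chunks are only vscale * 4 bytes apart, so consecutive
; accesses may overlap and the loads are not moved above the earlier stores.
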
define void @negative_tripletooshort_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_tripletooshort_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT:    eor z0.d, z1.d, z0.d
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT:    ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  %m9 = getelementptr inbounds i8, ptr %l9, i64 %l8
  %m11 = load <vscale x 16 x i8>, ptr %m9, align 16
  %m13 = mul <vscale x 16 x i8> %m11, %m11
  %m14 = xor <vscale x 16 x i8> %m13, %m11
  store <vscale x 16 x i8> %m14, ptr %m9, align 16
  ret void
}

declare i64 @llvm.vscale.i64()