; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Tests lowering of fixed-length vector sign/zero extends to SVE when a
; minimum vector width is known. Do not edit the CHECK lines by hand;
; regenerate them with update_llc_test_checks.py.
; NOTE(review): the 2048-bit RUN line reuses the VBITS_GE_512 prefixes, i.e.
; the expected output is the same for any width >= 512 bits.
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; sext i1 -> i32
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not byte based and thus cannot be lowered directly to
; an SVE instruction.
define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i1_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    lsl z0.s, z0.s, #31
; CHECK-NEXT:    asr z0.s, z0.s, #31
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i1> %a to <8 x i32>
  store <8 x i32> %b, ptr %out
  ret void
}

;
; sext i3 -> i64
;

; NOTE: Covers the scenario where a SIGN_EXTEND_INREG is required, whose inreg
; type's element type is not power-of-2 based and thus cannot be lowered
; directly to an SVE instruction.
; The lsl/asr pair below shifts by 61 (64 - 3) to sign-extend the 3-bit value
; within each 64-bit lane.
define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i3_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    lsl z0.d, z0.d, #61
; CHECK-NEXT:    asr z0.d, z0.d, #61
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i3> %a to <4 x i64>
  store <4 x i64> %b, ptr %out
  ret void
}

;
; sext i8 -> i16
;

define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i16>
  store <16 x i16>%b, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
; At 256-bit width v32i8 must be split (ext + two stores); once 512-bit
; vectors are available a single extend and store suffice.
define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v32i8_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v32i8_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, ptr %out
  ret void
}

define void @sext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, ptr %out
  ret void
}

define void @sext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v128i8_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i8>, ptr %in
  %b = add <128 x i8> %a, %a
  %c = sext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, ptr %out
  ret void
}

;
; sext i8 -> i32
;

define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i8> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v16i8_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v16i8_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, ptr %out
  ret void
}

define void @sext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @sext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i8_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = sext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; sext i8 -> i64
;

; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The sign
; extend is a two step process where the container is any_extend'd with the
; result feeding an inreg sign extend.
; The explicit sxtb below implements the inreg sign extend: only the low 8
; bits of each unpacked lane hold the original value.
define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i8_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i8> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i8_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i8_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <8 x i8> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i8_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, ptr %out
  ret void
}

define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i8_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = sext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; sext i16 -> i32
;

define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v8i16_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <8 x i16> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v16i16_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v16i16_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, ptr %out
  ret void
}

define void @sext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @sext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v64i16_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = add <64 x i16> %a, %a
  %c = sext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; sext i16 -> i64
;

define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i16_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i16> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i16_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i16_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = sext <8 x i16> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @sext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i16_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = sext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i16_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = sext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; sext i32 -> i64
;

define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: sext_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = sext <4 x i32> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: sext_v8i32_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sext_v8i32_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = add <8 x i32> %a, %a
  %c = sext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, ptr %out
  ret void
}

define void @sext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: sext_v16i32_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = add <16 x i32> %a, %a
  %c = sext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: sext_v32i32_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = add <32 x i32> %a, %a
  %c = sext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i8 -> i16
;

define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i16>
  store <16 x i16>%b, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the extend being combined with the load.
; The 'add' in each loaded-input test below keeps the extend from being
; folded into the load, so the standalone extend lowering is exercised.
define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v32i8_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v32i8_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i16>
  store <32 x i16> %c, ptr %out
  ret void
}

define void @zext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i16>
  store <64 x i16> %c, ptr %out
  ret void
}

define void @zext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v128i8_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i8>, ptr %in
  %b = add <128 x i8> %a, %a
  %c = zext <128 x i8> %b to <128 x i16>
  store <128 x i16> %c, ptr %out
  ret void
}

;
; zext i8 -> i32
;

define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i8_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <8 x i8> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v16i8_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v16i8_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i32>
  store <16 x i32> %b, ptr %out
  ret void
}

define void @zext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @zext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i8_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i8>, ptr %in
  %b = add <64 x i8> %a, %a
  %c = zext <64 x i8> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; zext i8 -> i64
;

; NOTE: v4i8 is an unpacked type stored within a v4i16 container. The zero
; extend is a two step process where the container is zero_extend_inreg'd with
; the result feeding a normal zero extend from halfs to doublewords.
; The bic clears the high byte of each 16-bit container lane, implementing
; the zero_extend_inreg before the widening unpacks.
define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i8_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i8> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i8_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i8_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <8 x i8> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <16 x i8> %a to <16 x i64>
  store <16 x i64> %b, ptr %out
  ret void
}

define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i8_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    uunpklo z0.h, z0.b
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i8>, ptr %in
  %b = add <32 x i8> %a, %a
  %c = zext <32 x i8> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i16 -> i32
;

define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v8i16_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <8 x i16> %a to <8 x i32>
  store <8 x i32>%b, ptr %out
  ret void
}

define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v16i16_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v16i16_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i32>
  store <16 x i32> %c, ptr %out
  ret void
}

define void @zext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i32>
  store <32 x i32> %c, ptr %out
  ret void
}

define void @zext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v64i16_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = add <64 x i16> %a, %a
  %c = zext <64 x i16> %b to <64 x i32>
  store <64 x i32> %c, ptr %out
  ret void
}

;
; zext i16 -> i64
;

define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i16_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i16> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i16_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i16_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 def $z0
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %b = zext <8 x i16> %a to <8 x i64>
  store <8 x i64>%b, ptr %out
  ret void
}

define void @zext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i16_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = add <16 x i16> %a, %a
  %c = zext <16 x i16> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i16_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = add <32 x i16> %a, %a
  %c = zext <32 x i16> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

;
; zext i32 -> i64
;

define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: zext_v4i32_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %b = zext <4 x i32> %a to <4 x i64>
  store <4 x i64>%b, ptr %out
  ret void
}

define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: zext_v8i32_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: zext_v8i32_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = add <8 x i32> %a, %a
  %c = zext <8 x i32> %b to <8 x i64>
  store <8 x i64> %c, ptr %out
  ret void
}

define void @zext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: zext_v16i32_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = add <16 x i32> %a, %a
  %c = zext <16 x i32> %b to <16 x i64>
  store <16 x i64> %c, ptr %out
  ret void
}

define void @zext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: zext_v32i32_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = add <32 x i32> %a, %a
  %c = zext <32 x i32> %b to <32 x i64>
  store <32 x i64> %c, ptr %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }