; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; truncate i16 -> i8
;

define <16 x i8> @trunc_v16i16_v16i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v16i16_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  %b = trunc <16 x i16> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p0, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    add z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v32i16_v32i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  %b = trunc <32 x i16> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v64i16_v64i8(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v64i16_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  %b = trunc <64 x i16> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
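; NOTE: vscale_range(16,0) guarantees Z registers of at least 2048 bits, so
; the whole <128 x i16> below should fit in one register and the truncate is
; expected to lower to a single uzp1, which keeps the even-indexed byte
; elements (the low byte of each halfword on little-endian).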
define void @trunc_v128i16_v128i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v128i16_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <128 x i16>, ptr %in
  %b = trunc <128 x i16> %a to <128 x i8>
  %c = add <128 x i8> %b, %b
  store <128 x i8> %c, ptr %out
  ret void
}

;
; truncate i32 -> i8
;

define <8 x i8> @trunc_v8i32_v8i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = trunc <8 x i32> %a to <8 x i8>
  ret <8 x i8> %b
}

define <16 x i8> @trunc_v16i32_v16i8(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z2.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z0.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = trunc <16 x i32> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i8(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = trunc <32 x i32> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
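; NOTE: uzp1 halves the element size once per use, so an i32 -> i8 truncate
; is expected to take two repacking steps (.s to .h, then .h to .b) before
; the add and the predicated store.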
define void @trunc_v64i32_v64i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  %b = trunc <64 x i32> %a to <64 x i8>
  %c = add <64 x i8> %b, %b
  store <64 x i8> %c, ptr %out
  ret void
}

;
; truncate i32 -> i16
;

define <8 x i16> @trunc_v8i32_v8i16(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v8i32_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  %b = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    splice z1.h, p0, z1.h, z0.h
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    add z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v16i32_v16i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  %b = trunc <16 x i32> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v32i32_v32i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v32i32_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  %b = trunc <32 x i32> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
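; NOTE: i32 -> i16 halves the element size exactly once, so a single uzp1
; should suffice here regardless of the vector length in use.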
define void @trunc_v64i32_v64i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v64i32_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  %b = trunc <64 x i32> %a to <64 x i16>
  %c = add <64 x i16> %b, %b
  store <64 x i16> %c, ptr %out
  ret void
}

;
; truncate i64 -> i8
;

; NOTE: v4i8 is not legal so result i8 elements are held within i16 containers.
define <4 x i8> @trunc_v4i64_v4i8(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i8>
  ret <4 x i8> %b
}

define <8 x i8> @trunc_v8i64_v8i8(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    // kill: def $d0 killed $d0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i8>
  ret <8 x i8> %b
}

define <16 x i8> @trunc_v16i64_v16i8(ptr %in) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
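; NOTE: i64 -> i8 is the deepest narrowing in this file and is expected to
; need three uzp1 steps (.s, .h, .b), each halving the element size.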
define void @trunc_v32i64_v32i8(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    add z0.b, z0.b, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i8>
  %c = add <32 x i8> %b, %b
  store <32 x i8> %c, ptr %out
  ret void
}

;
; truncate i64 -> i16
;

define <4 x i16> @trunc_v4i64_v4i16(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i16>
  ret <4 x i16> %b
}

define <8 x i16> @trunc_v8i64_v8i16(ptr %in) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z2.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov v0.d[1], v2.d[0]
; VBITS_GE_256-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    // kill: def $q0 killed $q0 killed $z0
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i16>
  %c = add <16 x i16> %b, %b
  store <16 x i16> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
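; NOTE: As with i32 -> i8 above, an i64 -> i16 truncate is expected to take
; two uzp1 repacking steps.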
define void @trunc_v32i64_v32i16(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    add z0.h, z0.h, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i16>
  %c = add <32 x i16> %b, %b
  store <32 x i16> %c, ptr %out
  ret void
}

;
; truncate i64 -> i32
;

define <4 x i32> @trunc_v4i64_v4i32(ptr %in) vscale_range(2,0) #0 {
; CHECK-LABEL: trunc_v4i64_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x i64>, ptr %in
  %b = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %b
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p0, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    add z0.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: trunc_v8i64_v8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  %b = trunc <8 x i64> %a to <8 x i32>
  %c = add <8 x i32> %b, %b
  store <8 x i32> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
define void @trunc_v16i64_v16i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: trunc_v16i64_v16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  %b = trunc <16 x i64> %a to <16 x i32>
  %c = add <16 x i32> %b, %b
  store <16 x i32> %c, ptr %out
  ret void
}

; NOTE: Extra 'add' is to prevent the truncate being combined with the store.
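; NOTE: i64 -> i32 is again a single halving step, so one uzp1 should be
; enough even at the maximum tested vector length.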
define void @trunc_v32i64_v32i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: trunc_v32i64_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    add z0.s, z0.s, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  %b = trunc <32 x i64> %a to <32 x i32>
  %c = add <32 x i32> %b, %b
  store <32 x i32> %c, ptr %out
  ret void
}

attributes #0 = { nounwind "target-features"="+sve" }
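; NOTE: Attribute #0 enables SVE for every function above. Functions without
; an explicit vscale_range attribute instead derive their assumed vector
; length from the -aarch64-sve-vector-bits-min value on the RUN lines, which
; is why they carry separate VBITS_GE_256/VBITS_GE_512 check blocks.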