; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; FCVT H -> S
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x half>, ptr %a
  %res = fpext <2 x half> %op1 to <2 x float>
  store <2 x float> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvtl v0.4s, v0.4h
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x half>, ptr %a
  %res = fpext <4 x half> %op1 to <4 x float>
  store <4 x float> %res, ptr %b
  ret void
}

define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f16_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <8 x half>, ptr %a
  %res = fpext <8 x half> %op1 to <8 x float>
  store <8 x float> %res, ptr %b
  ret void
}

define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.s }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.h
; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v16f16_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %res = fpext <16 x half> %op1 to <16 x float>
  store <16 x float> %res, ptr %b
  ret void
}

define void @fcvt_v32f16_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %res = fpext <32 x half> %op1 to <32 x float>
  store <32 x float> %res, ptr %b
  ret void
}

define void @fcvt_v64f16_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f16_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.h
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %res = fpext <64 x half> %op1 to <64 x float>
  store <64 x float> %res, ptr %b
  ret void
}

;
; FCVT H -> D
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f16_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    fcvt d0, h0
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <1 x half>, ptr %a
  %res = fpext <1 x half> %op1 to <1 x double>
  store <1 x double> %res, ptr %b
  ret void
}

; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f16_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x half>, ptr %a
  %res = fpext <2 x half> %op1 to <2 x double>
  store <2 x double> %res, ptr %b
  ret void
}

define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f16_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x half>, ptr %a
  %res = fpext <4 x half> %op1 to <4 x double>
  store <4 x double> %res, ptr %b
  ret void
}

define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1h { z0.d }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.h
; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.h
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f16_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.h
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x half>, ptr %a
  %res = fpext <8 x half> %op1 to <8 x double>
  store <8 x double> %res, ptr %b
  ret void
}

define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f16_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %res = fpext <16 x half> %op1 to <16 x double>
  store <16 x double> %res, ptr %b
  ret void
}

define void @fcvt_v32f16_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f16_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.h
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %res = fpext <32 x half> %op1 to <32 x double>
  store <32 x double> %res, ptr %b
  ret void
}

;
; FCVT S -> D
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f32_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    fcvtl v0.2d, v0.2s
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <1 x float>, ptr %a
  %res = fpext <1 x float> %op1 to <1 x double>
  store <1 x double> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvtl v0.2d, v0.2s
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x float>, ptr %a
  %res = fpext <2 x float> %op1 to <2 x double>
  store <2 x double> %res, ptr %b
  ret void
}

define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x float>, ptr %a
  %res = fpext <4 x float> %op1 to <4 x double>
  store <4 x double> %res, ptr %b
  ret void
}

define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.d }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.d, p0/m, z0.s
; VBITS_GE_256-NEXT:    fcvt z1.d, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f32_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.d, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %res = fpext <8 x float> %op1 to <8 x double>
  store <8 x double> %res, ptr %b
  ret void
}

define void @fcvt_v16f32_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f32_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %res = fpext <16 x float> %op1 to <16 x double>
  store <16 x double> %res, ptr %b
  ret void
}

define void @fcvt_v32f32_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.d, p0/m, z0.s
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %res = fpext <32 x float> %op1 to <32 x double>
  store <32 x double> %res, ptr %b
  ret void
}

;
; FCVT S -> H
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f32_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvtn v0.4h, v0.4s
; CHECK-NEXT:    str s0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x float>, ptr %a
  %res = fptrunc <2 x float> %op1 to <2 x half>
  store <2 x half> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f32_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    fcvtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x float>, ptr %a
  %res = fptrunc <4 x float> %op1 to <4 x half>
  store <4 x half> %res, ptr %b
  ret void
}

define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v8f32_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %res = fptrunc <8 x float> %op1 to <8 x half>
  store <8 x half> %res, ptr %b
  ret void
}

define void @fcvt_v16f32_v16f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.s
; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1h { z0.s }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v16f32_v16f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1h { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %res = fptrunc <16 x float> %op1 to <16 x half>
  store <16 x half> %res, ptr %b
  ret void
}

define void @fcvt_v32f32_v32f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v32f32_v32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %res = fptrunc <32 x float> %op1 to <32 x half>
  store <32 x half> %res, ptr %b
  ret void
}

define void @fcvt_v64f32_v64f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v64f32_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %res = fptrunc <64 x float> %op1 to <64 x half>
  store <64 x half> %res, ptr %b
  ret void
}

;
; FCVT D -> H
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcvt h0, d0
; CHECK-NEXT:    str h0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <1 x double>, ptr %a
  %res = fptrunc <1 x double> %op1 to <1 x half>
  store <1 x half> %res, ptr %b
  ret void
}

; v2f16 is not legal for NEON, so use SVE
define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    str s0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <2 x double>, ptr %a
  %res = fptrunc <2 x double> %op1 to <2 x half>
  store <2 x half> %res, ptr %b
  ret void
}

define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %res = fptrunc <4 x double> %op1 to <4 x half>
  store <4 x half> %res, ptr %b
  ret void
}

define void @fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d
; VBITS_GE_256-NEXT:    fcvt z0.h, p0/m, z0.d
; VBITS_GE_256-NEXT:    fcvt z1.h, p0/m, z1.d
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    str q1, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.h, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1h { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %res = fptrunc <8 x double> %op1 to <8 x half>
  store <8 x half> %res, ptr %b
  ret void
}

define void @fcvt_v16f64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %res = fptrunc <16 x double> %op1 to <16 x half>
  store <16 x half> %res, ptr %b
  ret void
}

define void @fcvt_v32f64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.h, p0/m, z0.d
; CHECK-NEXT:    st1h { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %res = fptrunc <32 x double> %op1 to <32 x half>
  store <32 x half> %res, ptr %b
  ret void
}

;
; FCVT D -> S
;

; Don't use SVE for 64-bit vectors.
define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v1f64_v1f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str s0, [x0]
; CHECK-NEXT:    ret
  %res = fptrunc <1 x double> %op1 to <1 x float>
  store <1 x float> %res, ptr %b
  ret void
}

; Don't use SVE for 128-bit vectors.
define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v2f64_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fcvtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %res = fptrunc <2 x double> %op1 to <2 x float>
  store <2 x float> %res, ptr %b
  ret void
}

define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fcvt_v4f64_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %res = fptrunc <4 x double> %op1 to <4 x float>
  store <4 x float> %res, ptr %b
  ret void
}

define void @fcvt_v8f64_v8f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcvt z0.s, p0/m, z0.d
; VBITS_GE_256-NEXT:    fcvt z1.s, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1w { z0.d }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fcvt_v8f64_v8f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcvt z0.s, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1w { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %res = fptrunc <8 x double> %op1 to <8 x float>
  store <8 x float> %res, ptr %b
  ret void
}

define void @fcvt_v16f64_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fcvt_v16f64_v16f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %res = fptrunc <16 x double> %op1 to <16 x float>
  store <16 x float> %res, ptr %b
  ret void
}

define void @fcvt_v32f64_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fcvt_v32f64_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcvt z0.s, p0/m, z0.d
; CHECK-NEXT:    st1w { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %res = fptrunc <32 x double> %op1 to <32 x float>
  store <32 x float> %res, ptr %b
  ret void
}

attributes #0 = { "target-features"="+sve" }