; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s

; == 8 to 64-bit elements ==

define <vscale x 16 x i8> @uzp_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind {
; CHECK-LABEL: uzp_x2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.b, z3.b }, z0.b, z1.b
; CHECK-NEXT:    add z0.b, z2.b, z0.b
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %uzp0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %uzp, 0
  %add = add <vscale x 16 x i8> %uzp0, %zn
  ret <vscale x 16 x i8> %add
}

define <vscale x 8 x i16> @uzp_x2_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) nounwind {
; CHECK-LABEL: uzp_x2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.h, z3.h }, z0.h, z1.h
; CHECK-NEXT:    add z0.h, z2.h, z0.h
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uzp.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %uzp0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %uzp, 0
  %add = add <vscale x 8 x i16> %uzp0, %zn
  ret <vscale x 8 x i16> %add
}

define <vscale x 8 x half> @uzp_x2_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) nounwind {
; CHECK-LABEL: uzp_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.h, z3.h }, z0.h, z1.h
; CHECK-NEXT:    fadd z0.h, z2.h, z0.h
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.uzp.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  %uzp0 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %uzp, 0
  %add = fadd <vscale x 8 x half> %uzp0, %zn
  ret <vscale x 8 x half> %add
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @uzp_x2_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) nounwind {
; CHECK-LABEL: uzp_x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.uzp.x2.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %uzp
}

define <vscale x 4 x i32> @uzp_x2_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) nounwind {
; CHECK-LABEL: uzp_x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.s, z3.s }, z0.s, z1.s
; CHECK-NEXT:    add z0.s, z2.s, z0.s
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uzp.x2.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
  %uzp0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %uzp, 0
  %add = add <vscale x 4 x i32> %uzp0, %zn
  ret <vscale x 4 x i32> %add
}

define <vscale x 4 x float> @uzp_x2_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) nounwind {
; CHECK-LABEL: uzp_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.s, z3.s }, z0.s, z1.s
; CHECK-NEXT:    fadd z0.s, z2.s, z0.s
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.uzp.x2.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
  %uzp0 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %uzp, 0
  %add = fadd <vscale x 4 x float> %uzp0, %zn
  ret <vscale x 4 x float> %add
}

define <vscale x 2 x i64> @uzp_x2_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) nounwind {
; CHECK-LABEL: uzp_x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.d, z3.d }, z0.d, z1.d
; CHECK-NEXT:    add z0.d, z2.d, z0.d
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uzp.x2.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm)
  %uzp0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %uzp, 0
  %add = add <vscale x 2 x i64> %uzp0, %zn
  ret <vscale x 2 x i64> %add
}

define <vscale x 2 x double> @uzp_x2_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) nounwind {
; CHECK-LABEL: uzp_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.d, z3.d }, z0.d, z1.d
; CHECK-NEXT:    fadd z0.d, z2.d, z0.d
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.uzp.x2.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
  %uzp0 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %uzp, 0
  %add = fadd <vscale x 2 x double> %uzp0, %zn
  ret <vscale x 2 x double> %add
}


; == 128-bit elements ==

; NOTE: For the 128-bit case we only need to check the <vscale x 16 x i8> to
; ensure the tuple result starts at the correct register multiple. The other
; variants all test the same code path.
define <vscale x 16 x i8> @uzpq_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z2.q, z3.q }, z0.q, z1.q
; CHECK-NEXT:    add z0.b, z2.b, z0.b
; CHECK-NEXT:    ret
  %uzp = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzpq.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  %uzp0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %uzp, 0
  %add = add <vscale x 16 x i8> %uzp0, %zn
  ret <vscale x 16 x i8> %add
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @uzpq_x2_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uzpq.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @uzpq_x2_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.uzpq.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @uzpq_x2_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.uzpq.x2.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @uzpq_x2_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uzpq.x2.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @uzpq_x2_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.uzpq.x2.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @uzpq_x2_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uzpq.x2.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @uzpq_x2_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z0.q, z1.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.uzpq.x2.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @uzpq_x2_i8_not_tied(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind {
; CHECK-LABEL: uzpq_x2_i8_not_tied:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp { z0.q, z1.q }, z1.q, z2.q
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzpq.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}


; == 8 to 64-bit elements ==
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uzp.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uzp.x2.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uzp.x2.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm)
declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.uzp.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.uzp.x2.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.uzp.x2.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.uzp.x2.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm)

; == 128-bit elements ==
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzpq.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uzpq.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uzpq.x2.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uzpq.x2.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm)
declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.uzpq.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.uzpq.x2.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.uzpq.x2.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm)
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.uzpq.x2.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm)