; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s

;
; Move Multi-Vector To Tile (Write) x 2
;
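
; The vg2 forms take a tile index, a slice index and two data vectors, e.g.
;   @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 %tile, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
; (the parameter names are illustrative; see the declarations at the end of
; this file). Each call is expected to lower to a single MOVA (write)
; instruction, printed through its "mov" alias. The tests below also cover:
;   * the copy of the slice index into w12, since the tile-slice forms
;     require a select register in the range w12-w15;
;   * folding of a constant added to %slice into the immediate slice offset,
;     whenever it fits the encodable range for the element size (up to 14:15
;     for bytes, 6:7 for halfwords, 2:3 for words; 64-bit elements only
;     encode 0:1, so just the base case is tested);
;   * the "// kill:" comments, which show the register allocator re-defining
;     the inputs as the consecutive tuple (z0_z1) consumed by the instruction.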

; Horizontal

define void @za_write_vg2_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT:    mov za0h.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @za_write_vg2_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @za_write_vg2_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  ret void
}

define void @za_write_vg2_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1h.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  ret void
}

define void @za_write_vg2_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  ret void
}

define void @za_write_vg2_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3h.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  ret void
}

define void @za_write_vg2_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
  ret void
}

define void @za_write_vg2_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_horiz_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0h.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
  ret void
}

; Vertical
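
; The vertical forms are identical to the horizontal ones above, except that
; they write column slices (za0v, za1v, za3v) instead of row slices.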

define void @za_write_vg2_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.b[w12, 0:1], { z0.b, z1.b }
; CHECK-NEXT:    mov za0v.b[w12, 14:15], { z0.b, z1.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice.14, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
  ret void
}

define void @za_write_vg2_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 %slice.6, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
  ret void
}

define void @za_write_vg2_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 %slice.6, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2)
  ret void
}

define void @za_write_vg2_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.h[w12, 0:1], { z0.h, z1.h }
; CHECK-NEXT:    mov za1v.h[w12, 6:7], { z0.h, z1.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 %slice.6, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2)
  ret void
}

define void @za_write_vg2_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 %slice.2, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2)
  ret void
}

define void @za_write_vg2_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.s[w12, 0:1], { z0.s, z1.s }
; CHECK-NEXT:    mov za3v.s[w12, 2:3], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  %slice.2 = add i32 %slice, 2
  call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 %slice.2, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
  ret void
}

define void @za_write_vg2_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
  ret void
}

define void @za_write_vg2_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2) {
; CHECK-LABEL: za_write_vg2_vert_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za0v.d[w12, 0:1], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2)
  ret void
}

;
; Move Multi-Vector To Tile (Write) x 4
;
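
; The vg4 forms take four data vectors and write four consecutive tile
; slices, so a folded slice offset must be a multiple of four (0:3 or 12:15
; for bytes, 0:3 or 4:7 for halfwords). For 32- and 64-bit elements 0:3 is
; the only encodable group, so only the base case is tested.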

; Horizontal

define void @za_write_vg4_horiz_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT:    mov za0h.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}

define void @za_write_vg4_horiz_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}

define void @za_write_vg4_horiz_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  ret void
}

define void @za_write_vg4_horiz_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1h.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  ret void
}

define void @za_write_vg4_horiz_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
  ret void
}

define void @za_write_vg4_horiz_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
  ret void
}

define void @za_write_vg4_horiz_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
  ret void
}

define void @za_write_vg4_horiz_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_horiz_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0h.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
  ret void
}

; Vertical
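
; As with vg2, the vertical vg4 forms differ from the horizontal ones only
; in writing column slices.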

define void @za_write_vg4_vert_b(i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.b[w12, 0:3], { z0.b - z3.b }
; CHECK-NEXT:    mov za0v.b[w12, 12:15], { z0.b - z3.b }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  %slice.12 = add i32 %slice, 12
  call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice.12, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
  ret void
}

define void @za_write_vg4_vert_h(i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 0, i32 %slice, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 %slice.4, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
  ret void
}

define void @za_write_vg4_vert_f16(i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 0, i32 %slice, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 %slice.4, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zn4)
  ret void
}

define void @za_write_vg4_vert_bf16(i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.h[w12, 0:3], { z0.h - z3.h }
; CHECK-NEXT:    mov za1v.h[w12, 4:7], { z0.h - z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 0, i32 %slice, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  %slice.4 = add i32 %slice, 4
  call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 %slice.4, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zn4)
  ret void
}

define void @za_write_vg4_vert_s(i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 0, i32 %slice, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, <vscale x 4 x i32> %zn4)
  ret void
}

define void @za_write_vg4_vert_f32(i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.s[w12, 0:3], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 0, i32 %slice, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn4)
  ret void
}

define void @za_write_vg4_vert_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 0, i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, <vscale x 2 x i64> %zn4)
  ret void
}

define void @za_write_vg4_vert_f64(i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4) {
; CHECK-LABEL: za_write_vg4_vert_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za0v.d[w12, 0:3], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 %slice, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, <vscale x 2 x double> %zn4)
  ret void
}

;
; Move Multi-Vector To ZA (Write) x2
;
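
; The vg1x2 forms take no tile index: they move two vectors into the ZA
; array as a vgx2 vector group selected by "w8 + offset", e.g.
;   @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
; (parameter names again illustrative). The base register must be in the
; range w8-w11 (hence "mov w8, w0") and the immediate offset is 0-7, so both
; ends of that range are tested. The lowering is element-type agnostic:
; every variant is written through the .d view of ZA.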

define void @za_write_vg1x2_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2) {
; CHECK-LABEL: za_write_vg1x2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2)
  ret void
}

define void @za_write_vg1x2_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2) {
; CHECK-LABEL: za_write_vg1x2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2)
  ret void
}

define void @za_write_vg1x2_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2) {
; CHECK-LABEL: za_write_vg1x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2)
  ret void
}

define void @za_write_vg1x2_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2) {
; CHECK-LABEL: za_write_vg1x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2)
  ret void
}

define void @za_write_vg1x2_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2) {
; CHECK-LABEL: za_write_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2)
  ret void
}

define void @za_write_vg1x2_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2) {
; CHECK-LABEL: za_write_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2)
  ret void
}

define void @za_write_vg1x2_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2) {
; CHECK-LABEL: za_write_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2)
  ret void
}

define void @za_write_vg1x2_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2) {
; CHECK-LABEL: za_write_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2)
  ret void
}

;
; Move Multi-Vector To ZA (Write) x4
;
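
; The vg1x4 forms are the four-vector counterpart of vg1x2, with the same
; w8-w11 base register and 0-7 immediate offset range, writing the tuple
; { z0.d - z3.d } as a vgx4 group.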

define void @za_write_vg1x4_b(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4) {
; CHECK-LABEL: za_write_vg1x4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32 %slice.7, <vscale x 16 x i8> %za1, <vscale x 16 x i8> %za2, <vscale x 16 x i8> %za3, <vscale x 16 x i8> %za4)
  ret void
}

define void @za_write_vg1x4_h(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4) {
; CHECK-LABEL: za_write_vg1x4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32 %slice.7, <vscale x 8 x i16> %za1, <vscale x 8 x i16> %za2, <vscale x 8 x i16> %za3, <vscale x 8 x i16> %za4)
  ret void
}

define void @za_write_vg1x4_f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4) {
; CHECK-LABEL: za_write_vg1x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32 %slice.7, <vscale x 8 x half> %za1, <vscale x 8 x half> %za2, <vscale x 8 x half> %za3, <vscale x 8 x half> %za4)
  ret void
}

define void @za_write_vg1x4_bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4) {
; CHECK-LABEL: za_write_vg1x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32 %slice.7, <vscale x 8 x bfloat> %za1, <vscale x 8 x bfloat> %za2, <vscale x 8 x bfloat> %za3, <vscale x 8 x bfloat> %za4)
  ret void
}

define void @za_write_vg1x4_s(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4) {
; CHECK-LABEL: za_write_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %za1, <vscale x 4 x i32> %za2, <vscale x 4 x i32> %za3, <vscale x 4 x i32> %za4)
  ret void
}

define void @za_write_vg1x4_f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4) {
; CHECK-LABEL: za_write_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32 %slice.7, <vscale x 4 x float> %za1, <vscale x 4 x float> %za2, <vscale x 4 x float> %za3, <vscale x 4 x float> %za4)
  ret void
}

define void @za_write_vg1x4_d(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4) {
; CHECK-LABEL: za_write_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %za1, <vscale x 2 x i64> %za2, <vscale x 2 x i64> %za3, <vscale x 2 x i64> %za4)
  ret void
}

define void @za_write_vg1x4_f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4) {
; CHECK-LABEL: za_write_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    mov za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice.7, <vscale x 2 x double> %za1, <vscale x 2 x double> %za2, <vscale x 2 x double> %za3, <vscale x 2 x double> %za4)
  ret void
}

declare void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32, i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32, i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32, i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32, i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32, i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32, i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32, i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32, i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.write.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)