; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s

define void @ld1b(<vscale x 16 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1b {za0h.b[w12, 15]}, p0/z, [x0]
; CHECK-NEXT: ld1b {za0v.b[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ret
  %tileslice = add i32 %sliceidx, 15
  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 0)
  ret void;
}

define void @ld1b_with_addr_offset(<vscale x 16 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1b_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: ld1b {za0h.b[w13, 0]}, p0/z, [x0, x1]
; CHECK-NEXT: ld1b {za0v.b[w12, 15]}, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 15
  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
  ret void;
}

define void @ld1h(<vscale x 8 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1h {za0h.h[w12, 7]}, p0/z, [x0]
; CHECK-NEXT: ld1h {za1h.h[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1h {za0v.h[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1h {za1v.h[w12, 7]}, p0/z, [x0]
; CHECK-NEXT: ret
  %tileslice = add i32 %sliceidx, 7
  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 %tileslice)
  ret void;
}

define void @ld1h_with_addr_offset(<vscale x 8 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1h_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1h {za0h.h[w12, 7]}, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ld1h {za1v.h[w13, 0]}, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 7
  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> %pg, ptr %base, i32 1, i32 0)
  ret void;
}

define void @ld1w(<vscale x 4 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1w:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1w {za0h.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za1h.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za2h.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za3h.s[w12, 3]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za0v.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za1v.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za2v.s[w12, 3]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za3v.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ret
  %tileslice = add i32 %sliceidx, 3
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 0)
  ret void;
}

define void @ld1w_with_addr_offset(<vscale x 4 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1w_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1w {za0h.s[w13, 0]}, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ld1w {za3v.s[w12, 3]}, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 3
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %base, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %base, i32 3, i32 %tileslice)
  ret void;
}

define void @ld1d(<vscale x 2 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1d {za0h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za1h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za2h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za3h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za4h.d[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za5h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za6h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za7h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za0v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za1v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za2v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za3v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za4v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za5v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za6v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1d {za7v.d[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ret
  %tileslice = add i32 %sliceidx, 1
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 %tileslice)
  ret void;
}

define void @ld1d_with_addr_offset(<vscale x 2 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1d_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w2
; CHECK-NEXT: mov w13, wzr
; CHECK-NEXT: ld1d {za0h.d[w12, 1]}, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ld1d {za7v.d[w13, 0]}, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base = getelementptr i64, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 1
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %base, i32 7, i32 0)
  ret void;
}

define void @ld1q(<vscale x 1 x i1> %pg, ptr %ptr) {
; CHECK-LABEL: ld1q:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ld1q {za0h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za1h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za2h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za3h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za4h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za5h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za6h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za7h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za8h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za9h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za10h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za11h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za12h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za13h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za14h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za15h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za0v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za1v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za2v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za3v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za4v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za5v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za6v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za7v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za8v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za9v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za10v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za11v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za12v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za13v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za14v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0)
  ret void;
}

define void @ld1q_with_addr_offset(<vscale x 1 x i1> %pg, ptr %ptr, i64 %index) {
; CHECK-LABEL: ld1q_with_addr_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT: ret
  %base = getelementptr i128, ptr %ptr, i64 %index
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %base, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %base, i32 15, i32 0)
  ret void;
}

define void @ldr(ptr %ptr) {
; CHECK-LABEL: ldr:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, wzr
; CHECK-NEXT: ldr za[w12, 0], [x0]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0)
  ret void;
}

define void @ldr_with_off_15(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, #15 // =0xf
; CHECK-NEXT: add x8, x0, #15
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
  %base = getelementptr i8, ptr %ptr, i64 15
  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0)
  ret void;
}

define void @ldr_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15mulvl:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, #15 // =0xf
; CHECK-NEXT: addvl x8, x0, #15
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
  %vscale = call i64 @llvm.vscale.i64()
  %mulvl = mul i64 %vscale, 240
  %base = getelementptr i8, ptr %ptr, i64 %mulvl
  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0)
  ret void;
}

define void @ldr_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_16mulvl:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, #16 // =0x10
; CHECK-NEXT: addvl x8, x0, #16
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
  %vscale = call i64 @llvm.vscale.i64()
  %mulvl = mul i64 %vscale, 256
  %base = getelementptr i8, ptr %ptr, i64 %mulvl
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0)
  ret void;
}

define void @ldr_with_off_var(ptr %base, i32 %off) {
; CHECK-LABEL: ldr_with_off_var:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT: sxtw x8, w1
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: add w12, w1, #16
; CHECK-NEXT: madd x8, x9, x8, x0
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off)
  ret void;
}

define void @ldr_with_off_15imm(ptr %base) {
; CHECK-LABEL: ldr_with_off_15imm:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, #16 // =0x10
; CHECK-NEXT: ldr za[w12, 15], [x0, #15, mul vl]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15)
  ret void;
}

define void @ldr_with_off_16imm(ptr %base) {
; CHECK-LABEL: ldr_with_off_16imm:
; CHECK: // %bb.0:
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov w12, #32 // =0x20
; CHECK-NEXT: add x8, x0, x8, lsl #4
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ret
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16)
  ret void;
}

define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: ldr za[w12, 1], [x1, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x1, #2, mul vl]
; CHECK-NEXT: ldr za[w12, 3], [x1, #3, mul vl]
; CHECK-NEXT: ldr za[w12, 4], [x1, #4, mul vl]
; CHECK-NEXT: ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4)
  ret void
}

define void @ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm_15_18:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: add x8, x1, x8, lsl #4
; CHECK-NEXT: ldr za[w12, 15], [x1, #15, mul vl]
; CHECK-NEXT: add w12, w0, #16
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT: ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18)
  ret void
}

define void @ldr_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm_16_19:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: add w12, w0, #16
; CHECK-NEXT: add x8, x1, x8, lsl #4
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl]
; CHECK-NEXT: ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 19)
  ret void
}

define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm_31_34:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: add w12, w0, #16
; CHECK-NEXT: add x9, x1, x8, lsl #4
; CHECK-NEXT: add x8, x1, x8, lsl #5
; CHECK-NEXT: ldr za[w12, 15], [x9, #15, mul vl]
; CHECK-NEXT: add w12, w0, #32
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT: ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34)
  ret void
}

define void @ldr_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr, i64 %vnum) {
; CHECK-LABEL: ldr_with_off_many_imm_32_35:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: add w12, w0, #32
; CHECK-NEXT: add x8, x1, x8, lsl #5
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl]
; CHECK-NEXT: ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 35)
  ret void
}

define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) {
; CHECK-LABEL: ldr_with_off_many_var:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxtw x8, w2
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: add w12, w0, w2
; CHECK-NEXT: madd x8, x9, x8, x1
; CHECK-NEXT: ldr za[w12, 0], [x8]
; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl]
; CHECK-NEXT: ret
entry:
  %0 = trunc i64 %vnum to i32
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0)
  %1 = add i32 %0, 1
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1)
  %2 = add i32 %0, 2
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2)
  %3 = add i32 %0, 3
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3)
  ret void
}

define void @ldr_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) {
; CHECK-LABEL: ldr_with_off_many_var_high:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add w8, w2, #32
; CHECK-NEXT: rdsvl x10, #1
; CHECK-NEXT: sxtw x9, w8
; CHECK-NEXT: add w12, w0, w8
; CHECK-NEXT: madd x9, x10, x9, x1
; CHECK-NEXT: ldr za[w12, 1], [x9, #1, mul vl]
; CHECK-NEXT: ldr za[w12, 2], [x9, #2, mul vl]
; CHECK-NEXT: ldr za[w12, 3], [x9, #3, mul vl]
; CHECK-NEXT: ldr za[w12, 4], [x9, #4, mul vl]
; CHECK-NEXT: ret
entry:
  %0 = trunc i64 %vnum to i32
  %1 = add i32 %0, 33
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1)
  %2 = add i32 %0, 34
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2)
  %3 = add i32 %0, 35
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3)
  %4 = add i32 %0, 36
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %4)
  ret void
}

; Ensure that the tile offset is sunk, given that this is likely to be an 'add'
; that's decomposed into a base + offset in ISel.
define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
; CHECK-LABEL: test_ld1_sink_tile0_offset_operand:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov w12, w1
; CHECK-NEXT: .LBB24_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT: subs w2, w2, #1
; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0]
; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0]
; CHECK-NEXT: b.ne .LBB24_1
; CHECK-NEXT: // %bb.2: // %exit
; CHECK-NEXT: ret
entry:
  %add1 = add i32 %base, 1
  %add2 = add i32 %base, 2
  br label %for.body

for.body:
  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %base)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add1)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add2)
  %inc = add nuw nsw i32 %i, 1
  %exitcond.not = icmp eq i32 %inc, %N
  br i1 %exitcond.not, label %exit, label %for.body

exit:
  ret void
}

declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32)

declare void @llvm.aarch64.sme.ldr(i32, ptr, i32)
declare i64 @llvm.vscale.i64()