; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
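
; These tests exercise the scalar-plus-immediate addressing mode of the SVE
; contiguous loads: the offset is a signed multiple of the vector length in
; the range [-8, 7], printed as "#imm, mul vl". Offsets outside that range
; cannot be encoded in the instruction and are instead materialised into a
; register with RDVL (see the ld1b_out_of_*_bound tests).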

;
; LD1B
;

define <vscale x 16 x i8> @ld1b_upper_bound(<vscale x 16 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_inbound(<vscale x 16 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_scalar)
  ret <vscale x 16 x i8> %load
}
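
; For loads of an element type narrower than the destination element size,
; the zero/sign extension of the loaded value is folded into the load,
; producing the extending ld1b/ld1sb, ld1h/ld1sh and ld1w/ld1sw forms.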

define <vscale x 4 x i32> @ld1b_s_inbound(<vscale x 4 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_s_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pg, ptr %base_scalar)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sb_s_inbound(<vscale x 4 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1sb_s_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pg, ptr %base_scalar)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 16 x i8> @ld1b_lower_bound(<vscale x 16 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 -8
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_scalar)
  ret <vscale x 16 x i8> %load
}
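
; Offsets outside [-8, 7] do not fit the signed 4-bit immediate, so the
; offset is materialised with RDVL and a register-offset load is used.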

define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_out_of_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #8
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 8
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_out_of_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #-9
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 16 x i8>, ptr %a, i64 -9
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, ptr %base_scalar)
  ret <vscale x 16 x i8> %load
}

;
; LD1H
;

define <vscale x 8 x i16> @ld1b_h_inbound(<vscale x 8 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_h_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pg, ptr %base_scalar)
  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1sb_h_inbound(<vscale x 8 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1sb_h_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pg, ptr %base_scalar)
  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1h_inbound(<vscale x 8 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1h_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x i16>, ptr %a, i64 1
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %pg, ptr %base_scalar)
  ret <vscale x 8 x i16> %load
}

define <vscale x 4 x i32> @ld1h_s_inbound(<vscale x 4 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1h_s_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i16>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pg, ptr %base_scalar)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sh_s_inbound(<vscale x 4 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1sh_s_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i16>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pg, ptr %base_scalar)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ld1b_d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1b_d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pg, ptr %base_scalar)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sb_d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1sb_d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i8>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pg, ptr %base_scalar)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1h_d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1h_d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i16>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pg, ptr %base_scalar)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sh_d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1sh_d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i16>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pg, ptr %base_scalar)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 8 x half> @ld1h_f16_inbound(<vscale x 8 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1h_f16_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x half>, ptr %a, i64 1
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1> %pg, ptr %base_scalar)
  ret <vscale x 8 x half> %load
}

define <vscale x 8 x bfloat> @ld1h_bf16_inbound(<vscale x 8 x i1> %pg, ptr %a) #0 {
; CHECK-LABEL: ld1h_bf16_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 8 x bfloat>, ptr %a, i64 1
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, ptr %base_scalar)
  ret <vscale x 8 x bfloat> %load
}

;
; LD1W
;

define <vscale x 4 x i32> @ld1w_inbound(<vscale x 4 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1w_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x i32>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pg, ptr %base_scalar)
  ret <vscale x 4 x i32> %load
}

define <vscale x 4 x float> @ld1w_f32_inbound(<vscale x 4 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1w_f32_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 4 x float>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1> %pg, ptr %base_scalar)
  ret <vscale x 4 x float> %load
}

;
; LD1D
;

define <vscale x 2 x i64> @ld1d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i64>, ptr %a, i64 1
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1> %pg, ptr %base_scalar)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x i64> @ld1w_d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1w_d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i32>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pg, ptr %base_scalar)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sw_d_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1sw_d_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x i32>, ptr %a, i64 7
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pg, ptr %base_scalar)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1d_f64_inbound(<vscale x 2 x i1> %pg, ptr %a) {
; CHECK-LABEL: ld1d_f64_inbound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base = getelementptr <vscale x 2 x double>, ptr %a, i64 1
  %base_scalar = bitcast ptr %base to ptr
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %pg, ptr %base_scalar)
  ret <vscale x 2 x double> %load
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, ptr)

declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, ptr)

declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, ptr)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, ptr)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, ptr)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1>, ptr)

declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, ptr)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1>, ptr)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, ptr)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1>, ptr)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, ptr)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }