; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
; depending on it.

; Codegen tests for NEON loads of bfloat vectors: whole-vector, single-lane,
; all-lanes (dup) and multi-register (vld1xN / vld2 / vld3 / vld4) forms.
; The functions here all use the arm_aapcs_vfpcc calling convention, and the
; expected assembly for each one is pinned by the autogenerated assertions
; that directly follow its signature.

; Whole-vector loads of <4 x bfloat> and <8 x bfloat> from a pointer with
; "align 2".
define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(ptr nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <4 x bfloat>, ptr %ptr, align 2
  ret <4 x bfloat> %0
}

define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(ptr nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = load <8 x bfloat>, ptr %ptr, align 2
  ret <8 x bfloat> %0
}

; A scalar bfloat load inserted into one lane of the incoming vector %src:
; lane 0 of the 4-element vector, lane 7 of the 8-element vector. The
; expected output is a single-lane vld1.16 in both cases.
define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(ptr nocapture readonly %ptr, <4 x bfloat> %src) {
; CHECK-LABEL: test_vld1_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[0]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, ptr %ptr, align 2
  %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
  ret <4 x bfloat> %vld1_lane
}

define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(ptr nocapture readonly %ptr, <8 x bfloat> %src) {
; CHECK-LABEL: test_vld1q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d1[3]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, ptr %ptr, align 2
  %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
  ret <8 x bfloat> %vld1_lane
}

; A scalar bfloat load broadcast to every lane: the value is inserted into
; lane 0 of an undef vector and then shuffled with an all-zero mask.
define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(ptr nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, ptr %ptr, align 2
  %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
  %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %lane
}

; llvm.arm.neon.vld1x2 / vld1x3 / vld1x4 on 4- and 8-element bfloat vectors.
; Each bfloat vector in the struct returned by the intrinsic is bitcast to an
; i32 vector of the same total width and repacked into an array, so the
; function result is an array of i32 vectors rather than of bfloat vectors.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(ptr %ptr) {
; CHECK-LABEL: test_vld1_bf16_x2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr %ptr)
  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(ptr %ptr) {
; CHECK-LABEL: test_vld1q_bf16_x2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr %ptr)
  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(ptr %ptr) {
; CHECK-LABEL: test_vld1_bf16_x3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr %ptr)
  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(ptr %ptr) {
; CHECK-LABEL: test_vld1q_bf16_x3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0]!
; CHECK-NEXT:    vld1.16 {d3, d4, d5}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr %ptr)
  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(ptr %ptr) {
; CHECK-LABEL: test_vld1_bf16_x4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr %ptr)
  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
  %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(ptr %ptr) {
; CHECK-LABEL: test_vld1q_bf16_x4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0]!
; CHECK-NEXT:    vld1.16 {d4, d5, d6, d7}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr %ptr)
  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
  %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

; 8-element counterpart of test_vld1_dup_bf16: the same insertelement +
; all-zero shuffle pattern on <8 x bfloat>.
define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(ptr nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, ptr %ptr, align 2
  %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
  %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %lane
}

; llvm.arm.neon.vld2 / vld3 / vld4 and their "lane" variants.
; As in the vld1xN tests, results are bitcast from bfloat vectors to i32
; vectors and repacked into an array for the return value. The lane variants
; additionally take an array argument (%src.coerce) that is unpacked and
; bitcast to bfloat vectors before being passed to the intrinsic; they use
; lane operand 1 for the 4-element form and 7 for the 8-element form.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld2_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0, d1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr %ptr, i32 2)
  %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
  %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
  %0 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld2q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0, d1, d2, d3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr %ptr, i32 2)
  %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
  %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
  %0 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(ptr %ptr, [2 x <2 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld2_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
; CHECK-NEXT:    vld2.16 {d0[1], d1[1]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %src.coerce, 1
  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
  %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
  %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
  %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
  %2 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %2, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %3, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(ptr %ptr, [2 x <4 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld2q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    vld2.16 {d1[3], d3[3]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %src.coerce, 1
  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
  %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
  %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
  %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
  %2 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %2, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %3, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld3_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0, d1, d2}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr %ptr, i32 2)
  %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
  %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
  %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
  %0 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld3q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0, d2, d4}, [r0]!
; CHECK-NEXT:    vld3.16 {d1, d3, d5}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr %ptr, i32 2)
  %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
  %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
  %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
  %0 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(ptr %ptr, [3 x <2 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld3_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    vld3.16 {d0[1], d1[1], d2[1]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %src.coerce, 2
  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
  %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
  %vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 0
  %vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 1
  %vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 2
  %3 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
  %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
  %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %3, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %4, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %5, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(ptr %ptr, [3 x <4 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld3q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    vld3.16 {d1[3], d3[3], d5[3]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %src.coerce, 2
  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
  %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
  %vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 0
  %vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 1
  %vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 2
  %3 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
  %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
  %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %3, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %4, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %5, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld4_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0, d1, d2, d3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr %ptr, i32 2)
  %vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 0
  %vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 1
  %vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 2
  %vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 3
  %0 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld4q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0, d2, d4, d6}, [r0]!
; CHECK-NEXT:    vld4.16 {d1, d3, d5, d7}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr %ptr, i32 2)
  %vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 0
  %vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 1
  %vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 2
  %vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 3
  %0 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(ptr %ptr, [4 x <2 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld4_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %src.coerce, 2
  %src.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %src.coerce, 3
  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
  %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
  %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
  %vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 0
  %vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 1
  %vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 2
  %vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 3
  %4 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
  %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
  %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
  %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %4, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %5, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %6, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %7, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(ptr %ptr, [4 x <4 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld4q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %src.coerce, 2
  %src.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %src.coerce, 3
  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
  %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
  %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
  %vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 0
  %vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 1
  %vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 2
  %vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 3
  %4 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
  %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
  %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
  %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %4, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %5, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %6, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %7, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld2dup / vld3dup / vld4dup. The 4-element forms are expected
; to select one all-lanes instruction; the 8-element forms are expected to
; select two, one filling the even-numbered D registers and one filling the
; odd-numbered ones, both reading from [r0] without writeback.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld2_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0[], d1[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr %ptr, i32 2)
  %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
  %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
  %0 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld2q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0[], d2[]}, [r0]
; CHECK-NEXT:    vld2.16 {d1[], d3[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr %ptr, i32 2)
  %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
  %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
  %0 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld3_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0[], d1[], d2[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr %ptr, i32 2)
  %vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 0
  %vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 1
  %vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 2
  %0 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld3q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0[], d2[], d4[]}, [r0]
; CHECK-NEXT:    vld3.16 {d1[], d3[], d5[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr %ptr, i32 2)
  %vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 0
  %vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 1
  %vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 2
  %0 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld4_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr %ptr, i32 2)
  %vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 0
  %vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 1
  %vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 2
  %vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 3
  %0 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(ptr %ptr) {
; CHECK-LABEL: test_vld4q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
; CHECK-NEXT:    vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr %ptr, i32 2)
  %vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 0
  %vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 1
  %vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 2
  %vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 3
  %0 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

define arm_aapcs_vfpcc void @test_vst1_bf16(ptr %ptr, <4 x bfloat> %val) {
;
CHECK-LABEL: test_vst1_bf16: 591; CHECK: @ %bb.0: @ %entry 592; CHECK-NEXT: vst1.16 {d0}, [r0] 593; CHECK-NEXT: bx lr 594entry: 595 tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr %ptr, <4 x bfloat> %val, i32 2) 596 ret void 597} 598 599define arm_aapcs_vfpcc void @test_vst1q_bf16(ptr %ptr, <8 x bfloat> %val) { 600; CHECK-LABEL: test_vst1q_bf16: 601; CHECK: @ %bb.0: @ %entry 602; CHECK-NEXT: vst1.16 {d0, d1}, [r0] 603; CHECK-NEXT: bx lr 604entry: 605 tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr %ptr, <8 x bfloat> %val, i32 2) 606 ret void 607} 608 609define arm_aapcs_vfpcc void @test_vst1_lane_bf16(ptr nocapture %ptr, <4 x bfloat> %val) { 610; CHECK-LABEL: test_vst1_lane_bf16: 611; CHECK: @ %bb.0: @ %entry 612; CHECK-NEXT: vmovx.f16 s0, s0 613; CHECK-NEXT: vstr.16 s0, [r0] 614; CHECK-NEXT: bx lr 615entry: 616 %0 = extractelement <4 x bfloat> %val, i32 1 617 store bfloat %0, ptr %ptr, align 2 618 ret void 619} 620 621define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(ptr nocapture %ptr, <8 x bfloat> %val) { 622; CHECK-LABEL: test_vst1q_lane_bf16: 623; CHECK: @ %bb.0: @ %entry 624; CHECK-NEXT: vmovx.f16 s0, s3 625; CHECK-NEXT: vstr.16 s0, [r0] 626; CHECK-NEXT: bx lr 627entry: 628 %0 = extractelement <8 x bfloat> %val, i32 7 629 store bfloat %0, ptr %ptr, align 2 630 ret void 631} 632 633define arm_aapcs_vfpcc void @test_vst1_bf16_x2(ptr nocapture %ptr, [2 x <2 x i32>] %val.coerce) { 634; CHECK-LABEL: test_vst1_bf16_x2: 635; CHECK: @ %bb.0: @ %entry 636; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 637; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 638; CHECK-NEXT: vst1.16 {d0, d1}, [r0] 639; CHECK-NEXT: bx lr 640entry: 641 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 642 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1 643 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 644 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 645 tail call void 
@llvm.arm.neon.vst1x2.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1) 646 ret void 647} 648 649define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(ptr nocapture %ptr, [2 x <4 x i32>] %val.coerce) { 650; CHECK-LABEL: test_vst1q_bf16_x2: 651; CHECK: @ %bb.0: @ %entry 652; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 653; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 654; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0] 655; CHECK-NEXT: bx lr 656entry: 657 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 658 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1 659 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 660 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 661 tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1) 662 ret void 663} 664 665define arm_aapcs_vfpcc void @test_vst1_bf16_x3(ptr nocapture %ptr, [3 x <2 x i32>] %val.coerce) { 666; CHECK-LABEL: test_vst1_bf16_x3: 667; CHECK: @ %bb.0: @ %entry 668; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 669; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 670; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 671; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0] 672; CHECK-NEXT: bx lr 673entry: 674 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 675 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1 676 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2 677 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 678 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 679 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 680 tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2) 681 ret void 682} 683 684define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(ptr nocapture %ptr, [3 x <4 x 
i32>] %val.coerce) { 685; CHECK-LABEL: test_vst1q_bf16_x3: 686; CHECK: @ %bb.0: @ %entry 687; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 688; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 689; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 690; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0]! 691; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0] 692; CHECK-NEXT: bx lr 693entry: 694 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 695 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1 696 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2 697 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 698 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 699 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 700 tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2) 701 ret void 702} 703 704define arm_aapcs_vfpcc void @test_vst1_bf16_x4(ptr nocapture %ptr, [4 x <2 x i32>] %val.coerce) { 705; CHECK-LABEL: test_vst1_bf16_x4: 706; CHECK: @ %bb.0: @ %entry 707; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1 708; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 709; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 710; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 711; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0] 712; CHECK-NEXT: bx lr 713entry: 714 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 715 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1 716 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2 717 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3 718 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 719 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 720 %2 = 
bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 721 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat> 722 tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3) 723 ret void 724} 725 726define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(ptr nocapture %ptr, [4 x <4 x i32>] %val.coerce) { 727; CHECK-LABEL: test_vst1q_bf16_x4: 728; CHECK: @ %bb.0: @ %entry 729; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 730; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 731; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 732; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 733; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0]! 734; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0] 735; CHECK-NEXT: bx lr 736entry: 737 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 738 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1 739 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2 740 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3 741 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 742 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 743 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 744 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat> 745 tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3) 746 ret void 747} 748 749define arm_aapcs_vfpcc void @test_vst2_bf16(ptr %ptr, [2 x <2 x i32>] %val.coerce) { 750; CHECK-LABEL: test_vst2_bf16: 751; CHECK: @ %bb.0: @ %entry 752; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 753; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 754; CHECK-NEXT: vst2.16 {d0, d1}, [r0] 755; CHECK-NEXT: bx lr 756entry: 757 
%val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 758 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1 759 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 760 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 761 tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 2) 762 ret void 763} 764 765define arm_aapcs_vfpcc void @test_vst2q_bf16(ptr %ptr, [2 x <4 x i32>] %val.coerce) { 766; CHECK-LABEL: test_vst2q_bf16: 767; CHECK: @ %bb.0: @ %entry 768; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 769; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 770; CHECK-NEXT: vst2.16 {d0, d1, d2, d3}, [r0] 771; CHECK-NEXT: bx lr 772entry: 773 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 774 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1 775 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 776 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 777 tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 2) 778 ret void 779} 780 781define arm_aapcs_vfpcc void @test_vst2_lane_bf16(ptr %ptr, [2 x <2 x i32>] %val.coerce) { 782; CHECK-LABEL: test_vst2_lane_bf16: 783; CHECK: @ %bb.0: @ %entry 784; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 785; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 786; CHECK-NEXT: vst2.16 {d0[1], d1[1]}, [r0] 787; CHECK-NEXT: bx lr 788entry: 789 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 790 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1 791 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 792 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 793 tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2) 794 ret void 795} 796 797define 
arm_aapcs_vfpcc void @test_vst2q_lane_bf16(ptr %ptr, [2 x <4 x i32>] %val.coerce) { 798; CHECK-LABEL: test_vst2q_lane_bf16: 799; CHECK: @ %bb.0: @ %entry 800; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 801; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 802; CHECK-NEXT: vst2.16 {d1[3], d3[3]}, [r0] 803; CHECK-NEXT: bx lr 804entry: 805 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 806 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1 807 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 808 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 809 tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2) 810 ret void 811} 812 813define arm_aapcs_vfpcc void @test_vst3_bf16(ptr %ptr, [3 x <2 x i32>] %val.coerce) { 814; CHECK-LABEL: test_vst3_bf16: 815; CHECK: @ %bb.0: @ %entry 816; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 817; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 818; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 819; CHECK-NEXT: vst3.16 {d0, d1, d2}, [r0] 820; CHECK-NEXT: bx lr 821entry: 822 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 823 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1 824 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2 825 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 826 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 827 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 828 tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2) 829 ret void 830} 831 832define arm_aapcs_vfpcc void @test_vst3q_bf16(ptr %ptr, [3 x <4 x i32>] %val.coerce) { 833; CHECK-LABEL: test_vst3q_bf16: 834; CHECK: @ %bb.0: @ %entry 835; CHECK-NEXT: @ kill: def $q2 killed 
$q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 836; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 837; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 838; CHECK-NEXT: vst3.16 {d0, d2, d4}, [r0]! 839; CHECK-NEXT: vst3.16 {d1, d3, d5}, [r0] 840; CHECK-NEXT: bx lr 841entry: 842 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 843 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1 844 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2 845 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 846 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 847 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 848 tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2) 849 ret void 850} 851 852define arm_aapcs_vfpcc void @test_vst3_lane_bf16(ptr %ptr, [3 x <2 x i32>] %val.coerce) { 853; CHECK-LABEL: test_vst3_lane_bf16: 854; CHECK: @ %bb.0: @ %entry 855; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 856; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 857; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 858; CHECK-NEXT: vst3.16 {d0[1], d1[1], d2[1]}, [r0] 859; CHECK-NEXT: bx lr 860entry: 861 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 862 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1 863 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2 864 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 865 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 866 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 867 tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) 868 ret void 869} 870 871define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(ptr 
%ptr, [3 x <4 x i32>] %val.coerce) { 872; CHECK-LABEL: test_vst3q_lane_bf16: 873; CHECK: @ %bb.0: @ %entry 874; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 875; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 876; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 877; CHECK-NEXT: vst3.16 {d1[3], d3[3], d5[3]}, [r0] 878; CHECK-NEXT: bx lr 879entry: 880 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 881 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1 882 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2 883 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 884 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 885 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 886 tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2) 887 ret void 888} 889 890define arm_aapcs_vfpcc void @test_vst4_bf16(ptr %ptr, [4 x <2 x i32>] %val.coerce) { 891; CHECK-LABEL: test_vst4_bf16: 892; CHECK: @ %bb.0: @ %entry 893; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1 894; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 895; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 896; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 897; CHECK-NEXT: vst4.16 {d0, d1, d2, d3}, [r0] 898; CHECK-NEXT: bx lr 899entry: 900 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 901 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1 902 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2 903 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3 904 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 905 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 906 %2 = bitcast <2 x i32> 
%val.coerce.fca.2.extract to <4 x bfloat> 907 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat> 908 tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2) 909 ret void 910} 911 912define arm_aapcs_vfpcc void @test_vst4q_bf16(ptr %ptr, [4 x <4 x i32>] %val.coerce) { 913; CHECK-LABEL: test_vst4q_bf16: 914; CHECK: @ %bb.0: @ %entry 915; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 916; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 917; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 918; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 919; CHECK-NEXT: vst4.16 {d0, d2, d4, d6}, [r0]! 920; CHECK-NEXT: vst4.16 {d1, d3, d5, d7}, [r0] 921; CHECK-NEXT: bx lr 922entry: 923 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 924 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1 925 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2 926 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3 927 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 928 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 929 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 930 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat> 931 tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2) 932 ret void 933} 934 935define arm_aapcs_vfpcc void @test_vst4_lane_bf16(ptr %ptr, [4 x <2 x i32>] %val.coerce) { 936; CHECK-LABEL: test_vst4_lane_bf16: 937; CHECK: @ %bb.0: @ %entry 938; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1 939; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 940; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 941; CHECK-NEXT: @ 
kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 942; CHECK-NEXT: vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0] 943; CHECK-NEXT: bx lr 944entry: 945 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 946 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1 947 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2 948 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3 949 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 950 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 951 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 952 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat> 953 tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) 954 ret void 955} 956 957define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(ptr %ptr, [4 x <4 x i32>] %val.coerce) { 958; CHECK-LABEL: test_vst4q_lane_bf16: 959; CHECK: @ %bb.0: @ %entry 960; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 961; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 962; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 963; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 964; CHECK-NEXT: vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0] 965; CHECK-NEXT: bx lr 966entry: 967 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 968 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1 969 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2 970 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3 971 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 972 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 973 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 974 %3 = bitcast 
<4 x i32> %val.coerce.fca.3.extract to <8 x bfloat> 975 tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) 976 ret void 977} 978 979declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr, i32) 980declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr, i32) 981declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr, i32) 982declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr, i32) 983declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr, i32) 984declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr, i32) 985 986declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr, i32) 987declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr, i32) 988declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr, i32) 989declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr, i32) 990declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr, i32) 991declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr, i32) 992 993declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr) 994declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr) 995declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr) 996declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr) 997declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr) 998declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr) 999 1000declare { <4 x bfloat>, <4 x bfloat> 
} @llvm.arm.neon.vld2lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, i32, i32) 1001declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, i32, i32) 1002declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32) 1003declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32) 1004declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32) 1005declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32) 1006 1007declare void @llvm.arm.neon.vst1.p0.v4bf16(ptr, <4 x bfloat>, i32) 1008declare void @llvm.arm.neon.vst1.p0.v8bf16(ptr, <8 x bfloat>, i32) 1009declare void @llvm.arm.neon.vst2.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, i32) 1010declare void @llvm.arm.neon.vst2.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, i32) 1011declare void @llvm.arm.neon.vst3.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32) 1012declare void @llvm.arm.neon.vst3.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32) 1013declare void @llvm.arm.neon.vst4.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32) 1014declare void @llvm.arm.neon.vst4.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32) 1015 1016declare void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>) 1017declare void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>) 1018declare void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>) 1019declare void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>) 
; Intrinsic declarations: multi-vector vst1 (x4).
declare void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
declare void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)

; Intrinsic declarations: single-lane stores (vst2lane/vst3lane/vst4lane).
declare void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)