1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 2 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ 3 // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64 4 // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ 5 // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32 6 7 // REQUIRES: arm-registered-target,aarch64-registered-target 8 9 #include "arm_neon.h" 10 11 // CHECK-LABEL: @test_vld1_bf16( 12 // CHECK-NEXT: entry: 13 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr [[PTR:%.*]], align 2 14 // CHECK-NEXT: ret <4 x bfloat> [[TMP1]] 15 // 16 bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) { 17 return vld1_bf16(ptr); 18 } 19 20 // CHECK-LABEL: @test_vld1q_bf16( 21 // CHECK-NEXT: entry: 22 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[PTR:%.*]], align 2 23 // CHECK-NEXT: ret <8 x bfloat> [[TMP1]] 24 // 25 bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) { 26 return vld1q_bf16(ptr); 27 } 28 29 // CHECK-LABEL: @test_vld1_lane_bf16( 30 // CHECK-NEXT: entry: 31 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 32 // CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 0 33 // CHECK-NEXT: ret <4 x bfloat> [[VLD1_LANE]] 34 // 35 bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) { 36 return vld1_lane_bf16(ptr, src, 0); 37 } 38 39 // CHECK-LABEL: @test_vld1q_lane_bf16( 40 // CHECK-NEXT: entry: 41 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 42 // CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 7 43 // CHECK-NEXT: ret <8 x bfloat> [[VLD1_LANE]] 44 // 45 bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) { 46 return vld1q_lane_bf16(ptr, src, 7); 47 } 48 49 // CHECK-LABEL: @test_vld1_dup_bf16( 50 // CHECK-NEXT: entry: 51 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 52 // CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x bfloat> poison, bfloat [[TMP0]], i64 0 53 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <4 x i32> zeroinitializer 54 // CHECK-NEXT: ret <4 x bfloat> [[LANE]] 55 // 56 bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) { 57 return vld1_dup_bf16(ptr); 58 } 59 60 // CHECK64-LABEL: @test_vld1_bf16_x2( 61 // CHECK64-NEXT: entry: 62 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0(ptr [[PTR:%.*]]) 63 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 64 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 65 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 66 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 67 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 68 // 69 // CHECK32-LABEL: @test_vld1_bf16_x2( 70 // CHECK32-NEXT: entry: 71 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr [[PTR:%.*]]) 72 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 73 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 74 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32> 75 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32> 76 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0 77 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1 78 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 79 // 80 bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) { 81 return vld1_bf16_x2(ptr); 82 } 83 84 // CHECK64-LABEL: @test_vld1q_bf16_x2( 85 // CHECK64-NEXT: entry: 86 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0(ptr [[PTR:%.*]]) 87 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 88 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 89 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 90 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 91 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 92 // 93 // CHECK32-LABEL: @test_vld1q_bf16_x2( 94 // CHECK32-NEXT: entry: 95 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr [[PTR:%.*]]) 96 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 97 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 98 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32> 99 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32> 100 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0 101 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1 102 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 103 // 104 bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) { 105 return vld1q_bf16_x2(ptr); 106 } 107 108 // CHECK64-LABEL: @test_vld1_bf16_x3( 109 // CHECK64-NEXT: entry: 110 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0(ptr [[PTR:%.*]]) 111 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 112 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 113 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 114 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 115 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 116 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 117 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 118 // 119 // CHECK32-LABEL: @test_vld1_bf16_x3( 120 // CHECK32-NEXT: entry: 121 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr [[PTR:%.*]]) 122 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 123 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 124 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 125 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32> 126 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32> 127 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32> 128 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0 129 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1 130 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2 131 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 132 // 133 bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) { 134 return vld1_bf16_x3(ptr); 135 } 136 137 // CHECK64-LABEL: @test_vld1q_bf16_x3( 138 // CHECK64-NEXT: entry: 139 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0(ptr [[PTR:%.*]]) 140 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 141 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 142 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 143 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 144 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 145 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 146 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 147 // 148 // CHECK32-LABEL: @test_vld1q_bf16_x3( 149 // CHECK32-NEXT: entry: 150 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr [[PTR:%.*]]) 151 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 152 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 153 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 154 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32> 155 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32> 156 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32> 157 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0 158 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1 159 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2 160 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 161 // 162 bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) { 163 return vld1q_bf16_x3(ptr); 164 } 165 166 // CHECK64-LABEL: @test_vld1_bf16_x4( 167 // CHECK64-NEXT: entry: 168 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0(ptr [[PTR:%.*]]) 169 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 170 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 171 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 172 // CHECK64-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3 173 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 174 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 175 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 176 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 177 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 178 // 179 // CHECK32-LABEL: @test_vld1_bf16_x4( 180 // CHECK32-NEXT: entry: 181 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr [[PTR:%.*]]) 182 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 183 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 184 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 185 // CHECK32-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3 186 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32> 187 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32> 188 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32> 189 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <2 x i32> 190 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0 191 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1 192 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2 193 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP3]], 3 194 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 195 // 196 bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) { 197 return vld1_bf16_x4(ptr); 198 } 199 200 // CHECK64-LABEL: @test_vld1q_bf16_x4( 201 // CHECK64-NEXT: entry: 202 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0(ptr [[PTR:%.*]]) 203 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 204 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 205 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 206 // CHECK64-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3 207 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 208 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 209 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 210 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 211 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 212 // 213 // CHECK32-LABEL: @test_vld1q_bf16_x4( 214 // CHECK32-NEXT: entry: 215 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr [[PTR:%.*]]) 216 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 217 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 218 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 219 // CHECK32-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3 220 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32> 221 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32> 222 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32> 223 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <4 x i32> 224 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0 225 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1 226 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2 227 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP3]], 3 228 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 229 // 230 bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) { 231 return vld1q_bf16_x4(ptr); 232 } 233 234 // CHECK-LABEL: @test_vld1q_dup_bf16( 235 // CHECK-NEXT: entry: 236 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 237 // CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> poison, bfloat [[TMP0]], i64 0 238 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> poison, <8 x i32> zeroinitializer 239 // CHECK-NEXT: ret <8 x bfloat> [[LANE]] 240 // 241 bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) { 242 return vld1q_dup_bf16(ptr); 243 } 244 245 // CHECK64-LABEL: @test_vld2_bf16( 246 // CHECK64-NEXT: entry: 247 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0(ptr [[PTR:%.*]]) 248 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0 249 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1 250 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 251 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 252 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 253 // 254 // CHECK32-LABEL: @test_vld2_bf16( 255 // CHECK32-NEXT: entry: 256 // CHECK32-NEXT: [[VLD2_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 257 // CHECK32-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 0 258 // CHECK32-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 1 259 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_0_EXTRACT]] to <2 x i32> 260 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_1_EXTRACT]] to <2 x i32> 261 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 262 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 263 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 264 // 265 bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) { 266 return vld2_bf16(ptr); 267 } 268 269 // CHECK64-LABEL: @test_vld2q_bf16( 270 // CHECK64-NEXT: entry: 271 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0(ptr [[PTR:%.*]]) 272 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0 273 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1 274 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 275 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 276 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 277 // 278 // CHECK32-LABEL: @test_vld2q_bf16( 279 // CHECK32-NEXT: entry: 280 // CHECK32-NEXT: [[VLD2Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 281 // CHECK32-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 0 282 // CHECK32-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 1 283 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_0_EXTRACT]] to <4 x i32> 284 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_1_EXTRACT]] to <4 x i32> 285 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 286 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 287 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 288 // 289 bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) { 290 return vld2q_bf16(ptr); 291 } 292 293 // CHECK64-LABEL: @test_vld2_lane_bf16( 294 // CHECK64-NEXT: entry: 295 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0 296 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE]], 1 297 // CHECK64-NEXT: [[VLD2_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 298 // CHECK64-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 0 299 // CHECK64-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 1 300 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 301 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 302 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 303 // 304 // CHECK32-LABEL: @test_vld2_lane_bf16( 305 // CHECK32-NEXT: entry: 306 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE:%.*]], 0 307 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE]], 1 308 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 309 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 310 // CHECK32-NEXT: [[VLD2_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2) 311 // CHECK32-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 0 312 // CHECK32-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 1 313 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_0_EXTRACT]] to <2 x i32> 314 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_1_EXTRACT]] to <2 x i32> 315 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP3]], 0 316 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP4]], 1 317 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 318 // 319 bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) { 320 return vld2_lane_bf16(ptr, src, 1); 321 } 322 323 // CHECK64-LABEL: @test_vld2q_lane_bf16( 324 // CHECK64-NEXT: entry: 325 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0 326 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE]], 1 327 // CHECK64-NEXT: [[VLD2_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 328 // CHECK64-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 0 329 // CHECK64-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 1 330 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 331 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 332 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 333 // 334 // CHECK32-LABEL: @test_vld2q_lane_bf16( 335 // CHECK32-NEXT: entry: 336 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE:%.*]], 0 337 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE]], 1 338 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 339 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 340 // CHECK32-NEXT: [[VLD2Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2) 341 // CHECK32-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 0 342 // CHECK32-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 1 343 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32> 344 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32> 345 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP3]], 0 346 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP4]], 1 347 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 348 // 349 bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) { 350 return vld2q_lane_bf16(ptr, src, 7); 351 } 352 353 // CHECK64-LABEL: @test_vld3_bf16( 354 // CHECK64-NEXT: entry: 355 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0(ptr [[PTR:%.*]]) 356 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0 357 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1 358 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2 359 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 360 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 361 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 362 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 363 // 364 // CHECK32-LABEL: @test_vld3_bf16( 365 // CHECK32-NEXT: entry: 366 // CHECK32-NEXT: [[VLD3_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 367 // CHECK32-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 0 368 // CHECK32-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 1 369 // CHECK32-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 2 370 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_0_EXTRACT]] to <2 x i32> 371 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_1_EXTRACT]] to <2 x i32> 372 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_2_EXTRACT]] to <2 x i32> 373 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 374 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 375 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 376 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 377 // 378 bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) { 379 return vld3_bf16(ptr); 380 } 381 382 // CHECK64-LABEL: @test_vld3q_bf16( 383 // CHECK64-NEXT: entry: 384 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0(ptr [[PTR:%.*]]) 385 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0 386 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1 387 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2 388 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 389 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 390 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 391 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 392 // 393 // CHECK32-LABEL: @test_vld3q_bf16( 394 // CHECK32-NEXT: entry: 395 // CHECK32-NEXT: [[VLD3Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 396 // CHECK32-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 0 397 // CHECK32-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 1 398 // CHECK32-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 2 399 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_0_EXTRACT]] to <4 x i32> 400 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_1_EXTRACT]] to <4 x i32> 401 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_2_EXTRACT]] to <4 x i32> 402 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 403 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 404 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 405 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 406 // 407 bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) { 408 return vld3q_bf16(ptr); 409 } 410 411 // CHECK64-LABEL: @test_vld3_lane_bf16( 412 // CHECK64-NEXT: entry: 413 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0 414 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 1 415 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 2 416 // CHECK64-NEXT: [[VLD3_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 417 // CHECK64-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 0 418 // CHECK64-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 1 419 // CHECK64-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 2 420 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 421 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 422 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 423 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 424 // 425 // CHECK32-LABEL: @test_vld3_lane_bf16( 426 // CHECK32-NEXT: entry: 427 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE:%.*]], 0 428 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 1 429 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 2 430 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 431 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 432 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 433 // CHECK32-NEXT: [[VLD3_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2) 434 // CHECK32-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 0 435 // CHECK32-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 1 436 // CHECK32-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 2 437 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_0_EXTRACT]] to <2 x i32> 438 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_1_EXTRACT]] to <2 x i32> 439 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_2_EXTRACT]] to <2 x i32> 440 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP4]], 0 441 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP5]], 1 442 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP6]], 2 443 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 444 // 445 bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) { 446 return vld3_lane_bf16(ptr, src, 1); 447 } 448 449 // CHECK64-LABEL: @test_vld3q_lane_bf16( 450 // CHECK64-NEXT: entry: 451 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0 452 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 1 453 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 2 454 // CHECK64-NEXT: [[VLD3_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 455 // CHECK64-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 0 456 // CHECK64-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 1 457 // CHECK64-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 2 458 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 459 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 460 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 461 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 462 // 463 // CHECK32-LABEL: @test_vld3q_lane_bf16( 464 // CHECK32-NEXT: entry: 465 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE:%.*]], 0 466 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 1 467 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 2 468 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 469 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 470 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 471 // CHECK32-NEXT: [[VLD3Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2) 472 // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 0 473 // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 1 474 // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 2 475 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32> 476 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32> 477 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32> 478 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP4]], 0 479 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP5]], 1 480 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP6]], 2 481 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 482 // 483 bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) { 484 return vld3q_lane_bf16(ptr, src, 7); 485 // return vld3q_lane_bf16(ptr, src, 8); 486 } 487 488 // CHECK64-LABEL: @test_vld4_bf16( 489 // CHECK64-NEXT: entry: 490 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0(ptr [[PTR:%.*]]) 491 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0 492 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1 493 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2 494 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3 495 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 496 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 497 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 498 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 499 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 500 // 501 // CHECK32-LABEL: @test_vld4_bf16( 502 // CHECK32-NEXT: entry: 503 // CHECK32-NEXT: [[VLD4_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 504 // CHECK32-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 0 505 // CHECK32-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 1 506 // CHECK32-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 2 507 // CHECK32-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 3 508 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_0_EXTRACT]] to <2 x i32> 509 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_1_EXTRACT]] to <2 x i32> 510 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_2_EXTRACT]] to <2 x i32> 511 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_3_EXTRACT]] to <2 x i32> 512 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 513 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 514 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 515 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3 516 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 517 // 518 bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) { 519 return vld4_bf16(ptr); 520 } 521 522 // CHECK64-LABEL: @test_vld4q_bf16( 523 // CHECK64-NEXT: entry: 524 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0(ptr [[PTR:%.*]]) 525 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0 526 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1 527 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2 528 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3 529 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 530 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 531 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 532 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 533 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 534 // 535 // CHECK32-LABEL: @test_vld4q_bf16( 536 // CHECK32-NEXT: entry: 537 // CHECK32-NEXT: [[VLD4Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 538 // CHECK32-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 0 539 // CHECK32-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 1 540 // CHECK32-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 2 541 // CHECK32-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 3 542 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_0_EXTRACT]] to <4 x i32> 543 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_1_EXTRACT]] to <4 x i32> 544 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_2_EXTRACT]] to <4 x i32> 545 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_3_EXTRACT]] to <4 x i32> 546 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 547 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 548 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 549 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3 550 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 551 // 552 bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) { 553 return vld4q_bf16(ptr); 554 } 555 556 // CHECK64-LABEL: @test_vld4_lane_bf16( 557 // CHECK64-NEXT: entry: 558 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0 559 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 1 560 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 2 561 // CHECK64-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 3 562 // CHECK64-NEXT: [[VLD4_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 563 // CHECK64-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 0 564 // CHECK64-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 1 565 // CHECK64-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 2 566 // CHECK64-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 3 567 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 568 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 569 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 570 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 571 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 572 // 573 // CHECK32-LABEL: @test_vld4_lane_bf16( 574 // CHECK32-NEXT: entry: 575 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE:%.*]], 0 576 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 1 577 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 2 578 // CHECK32-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 3 579 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 580 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 581 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 582 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 583 // CHECK32-NEXT: [[VLD4_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2) 584 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 0 585 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 1 586 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 2 587 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 3 588 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_0_EXTRACT]] to <2 x i32> 589 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_1_EXTRACT]] to <2 x i32> 590 // CHECK32-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_2_EXTRACT]] to <2 x i32> 591 // CHECK32-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_3_EXTRACT]] to <2 x i32> 592 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP5]], 0 593 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP6]], 1 594 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP7]], 2 595 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP8]], 3 596 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 597 // 598 bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) { 599 return vld4_lane_bf16(ptr, src, 1); 600 } 601 602 // CHECK64-LABEL: @test_vld4q_lane_bf16( 603 // CHECK64-NEXT: entry: 604 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0 605 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 1 606 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 2 607 // CHECK64-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 3 608 // CHECK64-NEXT: [[VLD4_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 609 // CHECK64-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 0 610 // CHECK64-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 1 611 // CHECK64-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 2 612 // CHECK64-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 3 613 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 614 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 615 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 616 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 617 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 618 // 619 // CHECK32-LABEL: @test_vld4q_lane_bf16( 620 // CHECK32-NEXT: entry: 621 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE:%.*]], 0 622 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 1 623 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 2 624 // CHECK32-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 3 625 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 626 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 627 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 628 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 629 // CHECK32-NEXT: [[VLD4Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2) 630 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 0 631 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 1 632 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 2 633 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 3 634 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32> 635 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32> 636 // CHECK32-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32> 637 // CHECK32-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_3_EXTRACT]] to <4 x i32> 638 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP5]], 0 639 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP6]], 1 640 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP7]], 2 641 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP8]], 3 642 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 643 // 644 bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) { 645 return vld4q_lane_bf16(ptr, src, 7); 646 } 647 648 // CHECK64-LABEL: @test_vld2_dup_bf16( 649 // CHECK64-NEXT: entry: 650 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0(ptr [[PTR:%.*]]) 651 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0 652 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1 653 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 654 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 655 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 656 // 657 // CHECK32-LABEL: @test_vld2_dup_bf16( 658 // CHECK32-NEXT: entry: 659 // CHECK32-NEXT: [[VLD2_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 660 // CHECK32-NEXT: [[VLD2_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 0 661 // CHECK32-NEXT: [[VLD2_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 1 662 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_0_EXTRACT]] to <2 x i32> 663 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_1_EXTRACT]] to <2 x i32> 664 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 665 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 666 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 667 // 668 bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) { 669 return vld2_dup_bf16(ptr); 670 } 671 672 // CHECK64-LABEL: @test_vld2q_dup_bf16( 673 // CHECK64-NEXT: entry: 674 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0(ptr [[PTR:%.*]]) 675 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0 676 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1 677 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 678 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 679 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 680 // 681 // CHECK32-LABEL: @test_vld2q_dup_bf16( 682 // CHECK32-NEXT: entry: 683 // CHECK32-NEXT: [[VLD2Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 684 // CHECK32-NEXT: [[VLD2Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 0 685 // CHECK32-NEXT: [[VLD2Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 1 686 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32> 687 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32> 688 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 689 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 690 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 691 // 692 bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) { 693 return vld2q_dup_bf16(ptr); 694 } 695 696 // CHECK64-LABEL: @test_vld3_dup_bf16( 697 // CHECK64-NEXT: entry: 698 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0(ptr [[PTR:%.*]]) 699 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0 700 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1 701 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2 702 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 703 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 704 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 705 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 706 // 707 // CHECK32-LABEL: @test_vld3_dup_bf16( 708 // CHECK32-NEXT: entry: 709 // CHECK32-NEXT: [[VLD3_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 710 // CHECK32-NEXT: [[VLD3_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 0 711 // CHECK32-NEXT: [[VLD3_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 1 712 // CHECK32-NEXT: [[VLD3_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 2 713 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_0_EXTRACT]] to <2 x i32> 714 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_1_EXTRACT]] to <2 x i32> 715 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_2_EXTRACT]] to <2 x i32> 716 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 717 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 718 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 719 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 720 // 721 bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) { 722 return vld3_dup_bf16(ptr); 723 } 724 725 // CHECK64-LABEL: @test_vld3q_dup_bf16( 726 // CHECK64-NEXT: entry: 727 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0(ptr [[PTR:%.*]]) 728 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0 729 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1 730 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2 731 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 732 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 733 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 734 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 735 // 736 // CHECK32-LABEL: @test_vld3q_dup_bf16( 737 // CHECK32-NEXT: entry: 738 // CHECK32-NEXT: [[VLD3Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 739 // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 0 740 // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 1 741 // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 2 742 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32> 743 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32> 744 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32> 745 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 746 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 747 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 748 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 749 // 750 bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) { 751 return vld3q_dup_bf16(ptr); 752 } 753 754 // CHECK64-LABEL: @test_vld4_dup_bf16( 755 // CHECK64-NEXT: entry: 756 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0(ptr [[PTR:%.*]]) 757 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0 758 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1 759 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2 760 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3 761 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 762 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 763 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 764 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 765 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 766 // 767 // CHECK32-LABEL: @test_vld4_dup_bf16( 768 // CHECK32-NEXT: entry: 769 // CHECK32-NEXT: [[VLD4_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 770 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 0 771 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 1 772 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 2 773 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 3 774 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_0_EXTRACT]] to <2 x i32> 775 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_1_EXTRACT]] to <2 x i32> 776 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_2_EXTRACT]] to <2 x i32> 777 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_3_EXTRACT]] to <2 x i32> 778 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 779 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 780 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 781 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3 782 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 783 // 784 bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) { 785 return vld4_dup_bf16(ptr); 786 } 787 788 // CHECK64-LABEL: @test_vld4q_dup_bf16( 789 // CHECK64-NEXT: entry: 790 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0(ptr [[PTR:%.*]]) 791 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0 792 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1 793 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2 794 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3 795 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 796 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 797 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 798 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 799 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 800 // 801 // CHECK32-LABEL: @test_vld4q_dup_bf16( 802 // CHECK32-NEXT: entry: 803 // CHECK32-NEXT: [[VLD4Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 804 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 0 805 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 1 806 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 2 807 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 3 808 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32> 809 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32> 810 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32> 811 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_3_EXTRACT]] to <4 x i32> 812 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 813 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 814 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 815 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3 816 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 817 // 818 bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) { 819 return vld4q_dup_bf16(ptr); 820 } 821 822 // CHECK64-LABEL: @test_vst1_bf16( 823 // CHECK64-NEXT: entry: 824 // CHECK64-NEXT: store <4 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2 825 // CHECK64-NEXT: ret void 826 // 827 // CHECK32-LABEL: @test_vst1_bf16( 828 // CHECK32-NEXT: entry: 829 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[VAL:%.*]], i32 2) 830 // CHECK32-NEXT: ret void 831 // 832 void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) { 833 vst1_bf16(ptr, val); 834 } 835 836 // CHECK64-LABEL: @test_vst1q_bf16( 837 // CHECK64-NEXT: entry: 838 // CHECK64-NEXT: store <8 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2 839 // CHECK64-NEXT: ret void 840 // 841 // CHECK32-LABEL: @test_vst1q_bf16( 842 // CHECK32-NEXT: entry: 843 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[VAL:%.*]], i32 2) 844 // CHECK32-NEXT: ret void 845 // 846 void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) { 847 vst1q_bf16(ptr, val); 848 } 849 850 // CHECK-LABEL: @test_vst1_lane_bf16( 851 // CHECK-NEXT: entry: 852 // CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x bfloat> [[VAL:%.*]], i64 1 853 // CHECK-NEXT: store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2 854 // CHECK-NEXT: ret void 855 // 856 void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) { 857 vst1_lane_bf16(ptr, val, 1); 858 } 859 860 // CHECK-LABEL: @test_vst1q_lane_bf16( 861 // CHECK-NEXT: entry: 862 // CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x bfloat> [[VAL:%.*]], i64 7 863 // CHECK-NEXT: store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2 864 // CHECK-NEXT: ret void 865 // 866 void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) { 867 vst1q_lane_bf16(ptr, val, 7); 868 } 869 870 // CHECK64-LABEL: @test_vst1_bf16_x2( 871 // CHECK64-NEXT: entry: 872 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 873 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1 874 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 875 // CHECK64-NEXT: ret void 876 // 877 // CHECK32-LABEL: @test_vst1_bf16_x2( 878 // CHECK32-NEXT: entry: 879 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0 880 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1 881 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 882 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 883 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]]) 884 // CHECK32-NEXT: ret void 885 // 886 void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) { 887 vst1_bf16_x2(ptr, val); 888 } 889 890 // CHECK64-LABEL: @test_vst1q_bf16_x2( 891 // CHECK64-NEXT: entry: 892 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 893 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1 894 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 895 // CHECK64-NEXT: ret void 896 // 897 // CHECK32-LABEL: @test_vst1q_bf16_x2( 898 // CHECK32-NEXT: entry: 899 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0 900 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1 901 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 902 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 903 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) 904 // CHECK32-NEXT: ret void 905 // 906 void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) { 907 vst1q_bf16_x2(ptr, val); 908 } 909 910 // CHECK64-LABEL: @test_vst1_bf16_x3( 911 // CHECK64-NEXT: entry: 912 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 913 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1 914 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2 915 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 916 // CHECK64-NEXT: ret void 917 // 918 // CHECK32-LABEL: @test_vst1_bf16_x3( 919 // CHECK32-NEXT: entry: 920 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0 921 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1 922 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2 923 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 924 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 925 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 926 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]]) 927 // CHECK32-NEXT: ret void 928 // 929 void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) { 930 vst1_bf16_x3(ptr, val); 931 } 932 933 // CHECK64-LABEL: @test_vst1q_bf16_x3( 934 // CHECK64-NEXT: entry: 935 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 936 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1 937 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2 938 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 939 // CHECK64-NEXT: ret void 940 // 941 // CHECK32-LABEL: @test_vst1q_bf16_x3( 942 // CHECK32-NEXT: entry: 943 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0 944 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1 945 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2 946 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 947 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 948 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 949 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]]) 950 // CHECK32-NEXT: ret void 951 // 952 void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) { 953 vst1q_bf16_x3(ptr, val); 954 } 955 956 // CHECK64-LABEL: @test_vst1_bf16_x4( 957 // CHECK64-NEXT: entry: 958 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 959 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1 960 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2 961 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3 962 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 963 // CHECK64-NEXT: ret void 964 // 965 // CHECK32-LABEL: @test_vst1_bf16_x4( 966 // CHECK32-NEXT: entry: 967 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0 968 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1 969 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2 970 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3 971 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 972 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 973 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 974 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 975 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]]) 976 // CHECK32-NEXT: ret void 977 // 978 void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) { 979 vst1_bf16_x4(ptr, val); 980 } 981 982 // CHECK64-LABEL: @test_vst1q_bf16_x4( 983 // CHECK64-NEXT: entry: 984 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 985 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1 986 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2 987 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3 988 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 989 // CHECK64-NEXT: ret void 990 // 991 // CHECK32-LABEL: @test_vst1q_bf16_x4( 992 // CHECK32-NEXT: entry: 993 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0 994 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1 995 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2 996 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3 997 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 998 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 999 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1000 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 1001 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]]) 1002 // CHECK32-NEXT: ret void 1003 // 1004 void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) { 1005 vst1q_bf16_x4(ptr, val); 1006 } 1007 1008 // CHECK64-LABEL: @test_vst2_bf16( 1009 // CHECK64-NEXT: entry: 1010 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1011 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1 1012 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 1013 // CHECK64-NEXT: ret void 1014 // 1015 // CHECK32-LABEL: @test_vst2_bf16( 1016 // CHECK32-NEXT: entry: 1017 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1018 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1 1019 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1020 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1021 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 2) 1022 // CHECK32-NEXT: ret void 1023 // 1024 void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { 1025 vst2_bf16(ptr, val); 1026 } 1027 1028 // CHECK64-LABEL: @test_vst2q_bf16( 1029 // CHECK64-NEXT: entry: 1030 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1031 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1 1032 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 1033 // CHECK64-NEXT: ret void 1034 // 1035 // CHECK32-LABEL: @test_vst2q_bf16( 1036 // CHECK32-NEXT: entry: 1037 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1038 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1 1039 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1040 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1041 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 2) 1042 // CHECK32-NEXT: ret void 1043 // 1044 void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { 1045 vst2q_bf16(ptr, val); 1046 } 1047 1048 // CHECK64-LABEL: @test_vst2_lane_bf16( 1049 // CHECK64-NEXT: entry: 1050 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1051 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1 1052 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 1053 // CHECK64-NEXT: ret void 1054 // 1055 // CHECK32-LABEL: @test_vst2_lane_bf16( 1056 // CHECK32-NEXT: entry: 1057 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1058 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1 1059 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1060 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1061 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2) 1062 // CHECK32-NEXT: ret void 1063 // 1064 void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { 1065 vst2_lane_bf16(ptr, val, 1); 1066 } 1067 1068 // CHECK64-LABEL: @test_vst2q_lane_bf16( 1069 // CHECK64-NEXT: entry: 1070 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1071 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1 1072 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 1073 // CHECK64-NEXT: ret void 1074 // 1075 // CHECK32-LABEL: @test_vst2q_lane_bf16( 1076 // CHECK32-NEXT: entry: 1077 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1078 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1 1079 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1080 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1081 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2) 1082 // CHECK32-NEXT: ret void 1083 // 1084 void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { 1085 vst2q_lane_bf16(ptr, val, 7); 1086 } 1087 1088 // CHECK64-LABEL: @test_vst3_bf16( 1089 // CHECK64-NEXT: entry: 1090 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1091 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1 1092 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2 1093 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 1094 // CHECK64-NEXT: ret void 1095 // 1096 // CHECK32-LABEL: @test_vst3_bf16( 1097 // CHECK32-NEXT: entry: 1098 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1099 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1 1100 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2 1101 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1102 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1103 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1104 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 2) 1105 // CHECK32-NEXT: ret void 1106 // 1107 void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { 1108 vst3_bf16(ptr, val); 1109 } 1110 1111 // CHECK64-LABEL: @test_vst3q_bf16( 1112 // CHECK64-NEXT: entry: 1113 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1114 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1 1115 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2 1116 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 1117 // CHECK64-NEXT: ret void 1118 // 1119 // CHECK32-LABEL: @test_vst3q_bf16( 1120 // CHECK32-NEXT: entry: 1121 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1122 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1 1123 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2 1124 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1125 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1126 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1127 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 2) 1128 // CHECK32-NEXT: ret void 1129 // 1130 void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { 1131 vst3q_bf16(ptr, val); 1132 } 1133 1134 // CHECK64-LABEL: @test_vst3_lane_bf16( 1135 // CHECK64-NEXT: entry: 1136 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1137 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1 1138 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2 1139 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 1140 // CHECK64-NEXT: ret void 1141 // 1142 // CHECK32-LABEL: @test_vst3_lane_bf16( 1143 // CHECK32-NEXT: entry: 1144 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1145 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1 1146 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2 1147 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1148 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1149 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1150 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2) 1151 // CHECK32-NEXT: ret void 1152 // 1153 void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { 1154 vst3_lane_bf16(ptr, val, 1); 1155 } 1156 1157 // CHECK64-LABEL: @test_vst3q_lane_bf16( 1158 // CHECK64-NEXT: entry: 1159 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1160 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1 1161 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2 1162 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 1163 // CHECK64-NEXT: ret void 1164 // 1165 // CHECK32-LABEL: @test_vst3q_lane_bf16( 1166 // CHECK32-NEXT: entry: 1167 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1168 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1 1169 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2 1170 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1171 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1172 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1173 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2) 1174 // CHECK32-NEXT: ret void 1175 // 1176 void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { 1177 vst3q_lane_bf16(ptr, val, 7); 1178 } 1179 1180 // CHECK64-LABEL: @test_vst4_bf16( 1181 // CHECK64-NEXT: entry: 1182 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1183 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1 1184 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2 1185 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3 1186 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 1187 // CHECK64-NEXT: ret void 1188 // 1189 // CHECK32-LABEL: @test_vst4_bf16( 1190 // CHECK32-NEXT: entry: 1191 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1192 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1 1193 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2 1194 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3 1195 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1196 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1197 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1198 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 1199 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 2) 1200 // CHECK32-NEXT: ret void 1201 // 1202 void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { 1203 vst4_bf16(ptr, val); 1204 } 1205 1206 // CHECK64-LABEL: @test_vst4q_bf16( 1207 // CHECK64-NEXT: entry: 1208 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1209 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1 1210 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2 1211 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3 1212 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 1213 // CHECK64-NEXT: ret void 1214 // 1215 // CHECK32-LABEL: @test_vst4q_bf16( 1216 // CHECK32-NEXT: entry: 1217 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1218 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1 1219 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2 1220 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3 1221 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1222 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1223 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1224 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 1225 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 2) 1226 // CHECK32-NEXT: ret void 1227 // 1228 void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { 1229 vst4q_bf16(ptr, val); 1230 } 1231 1232 // CHECK64-LABEL: @test_vst4_lane_bf16( 1233 // CHECK64-NEXT: entry: 1234 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1235 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1 1236 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2 1237 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3 1238 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 1239 // CHECK64-NEXT: ret void 1240 // 1241 // CHECK32-LABEL: @test_vst4_lane_bf16( 1242 // CHECK32-NEXT: entry: 1243 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1244 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1 1245 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2 1246 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3 1247 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1248 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1249 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1250 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 1251 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2) 1252 // CHECK32-NEXT: ret void 1253 // 1254 void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { 1255 vst4_lane_bf16(ptr, val, 1); 1256 } 1257 1258 // CHECK64-LABEL: @test_vst4q_lane_bf16( 1259 // CHECK64-NEXT: entry: 1260 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1261 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1 1262 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2 1263 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3 1264 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 1265 // CHECK64-NEXT: ret void 1266 // 1267 // CHECK32-LABEL: @test_vst4q_lane_bf16( 1268 // CHECK32-NEXT: entry: 1269 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1270 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1 1271 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2 1272 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3 1273 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1274 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1275 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1276 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 1277 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2) 1278 // CHECK32-NEXT: ret void 1279 // 1280 void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { 1281 vst4q_lane_bf16(ptr, val, 7); 1282 } 1283