1*207e5cccSFangrui Song // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 2*207e5cccSFangrui Song // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ 3*207e5cccSFangrui Song // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64 4*207e5cccSFangrui Song // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ 5*207e5cccSFangrui Song // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32 6*207e5cccSFangrui Song 7*207e5cccSFangrui Song // REQUIRES: arm-registered-target,aarch64-registered-target 8*207e5cccSFangrui Song 9*207e5cccSFangrui Song #include "arm_neon.h" 10*207e5cccSFangrui Song 11*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1_bf16( 12*207e5cccSFangrui Song // CHECK-NEXT: entry: 13*207e5cccSFangrui Song // CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr [[PTR:%.*]], align 2 14*207e5cccSFangrui Song // CHECK-NEXT: ret <4 x bfloat> [[TMP1]] 15*207e5cccSFangrui Song // 16*207e5cccSFangrui Song bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) { 17*207e5cccSFangrui Song return vld1_bf16(ptr); 18*207e5cccSFangrui Song } 19*207e5cccSFangrui Song 20*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1q_bf16( 21*207e5cccSFangrui Song // CHECK-NEXT: entry: 22*207e5cccSFangrui Song // CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[PTR:%.*]], align 2 23*207e5cccSFangrui Song // CHECK-NEXT: ret <8 x bfloat> [[TMP1]] 24*207e5cccSFangrui Song // 25*207e5cccSFangrui Song bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) { 26*207e5cccSFangrui Song return vld1q_bf16(ptr); 27*207e5cccSFangrui Song } 28*207e5cccSFangrui Song 29*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1_lane_bf16( 30*207e5cccSFangrui Song // CHECK-NEXT: entry: 31*207e5cccSFangrui Song // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 32*207e5cccSFangrui Song // CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x bfloat> 
[[SRC:%.*]], bfloat [[TMP0]], i64 0 33*207e5cccSFangrui Song // CHECK-NEXT: ret <4 x bfloat> [[VLD1_LANE]] 34*207e5cccSFangrui Song // 35*207e5cccSFangrui Song bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) { 36*207e5cccSFangrui Song return vld1_lane_bf16(ptr, src, 0); 37*207e5cccSFangrui Song } 38*207e5cccSFangrui Song 39*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1q_lane_bf16( 40*207e5cccSFangrui Song // CHECK-NEXT: entry: 41*207e5cccSFangrui Song // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 42*207e5cccSFangrui Song // CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 7 43*207e5cccSFangrui Song // CHECK-NEXT: ret <8 x bfloat> [[VLD1_LANE]] 44*207e5cccSFangrui Song // 45*207e5cccSFangrui Song bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) { 46*207e5cccSFangrui Song return vld1q_lane_bf16(ptr, src, 7); 47*207e5cccSFangrui Song } 48*207e5cccSFangrui Song 49*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1_dup_bf16( 50*207e5cccSFangrui Song // CHECK-NEXT: entry: 51*207e5cccSFangrui Song // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 52*207e5cccSFangrui Song // CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x bfloat> poison, bfloat [[TMP0]], i64 0 53*207e5cccSFangrui Song // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <4 x i32> zeroinitializer 54*207e5cccSFangrui Song // CHECK-NEXT: ret <4 x bfloat> [[LANE]] 55*207e5cccSFangrui Song // 56*207e5cccSFangrui Song bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) { 57*207e5cccSFangrui Song return vld1_dup_bf16(ptr); 58*207e5cccSFangrui Song } 59*207e5cccSFangrui Song 60*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1_bf16_x2( 61*207e5cccSFangrui Song // CHECK64-NEXT: entry: 62*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0(ptr [[PTR:%.*]]) 
63*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 64*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 65*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 66*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 67*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 68*207e5cccSFangrui Song // 69*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1_bf16_x2( 70*207e5cccSFangrui Song // CHECK32-NEXT: entry: 71*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr [[PTR:%.*]]) 72*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 73*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 74*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32> 75*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32> 76*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0 77*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1 78*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 79*207e5cccSFangrui Song // 80*207e5cccSFangrui Song bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) { 81*207e5cccSFangrui Song return vld1_bf16_x2(ptr); 
82*207e5cccSFangrui Song } 83*207e5cccSFangrui Song 84*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1q_bf16_x2( 85*207e5cccSFangrui Song // CHECK64-NEXT: entry: 86*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0(ptr [[PTR:%.*]]) 87*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 88*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 89*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 90*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 91*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 92*207e5cccSFangrui Song // 93*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1q_bf16_x2( 94*207e5cccSFangrui Song // CHECK32-NEXT: entry: 95*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr [[PTR:%.*]]) 96*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 97*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 98*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32> 99*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32> 100*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0 101*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = 
insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1 102*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 103*207e5cccSFangrui Song // 104*207e5cccSFangrui Song bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) { 105*207e5cccSFangrui Song return vld1q_bf16_x2(ptr); 106*207e5cccSFangrui Song } 107*207e5cccSFangrui Song 108*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1_bf16_x3( 109*207e5cccSFangrui Song // CHECK64-NEXT: entry: 110*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0(ptr [[PTR:%.*]]) 111*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 112*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 113*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 114*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 115*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 116*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 117*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 118*207e5cccSFangrui Song // 119*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1_bf16_x3( 120*207e5cccSFangrui Song // CHECK32-NEXT: entry: 121*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr 
[[PTR:%.*]]) 122*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 123*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 124*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 125*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32> 126*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32> 127*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32> 128*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0 129*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1 130*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2 131*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 132*207e5cccSFangrui Song // 133*207e5cccSFangrui Song bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) { 134*207e5cccSFangrui Song return vld1_bf16_x3(ptr); 135*207e5cccSFangrui Song } 136*207e5cccSFangrui Song 137*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1q_bf16_x3( 138*207e5cccSFangrui Song // CHECK64-NEXT: entry: 139*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0(ptr [[PTR:%.*]]) 140*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 141*207e5cccSFangrui Song // CHECK64-NEXT: 
[[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 142*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 143*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 144*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 145*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 146*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 147*207e5cccSFangrui Song // 148*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1q_bf16_x3( 149*207e5cccSFangrui Song // CHECK32-NEXT: entry: 150*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr [[PTR:%.*]]) 151*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 152*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 153*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 154*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32> 155*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32> 156*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32> 157*207e5cccSFangrui Song // 
CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0 158*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1 159*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2 160*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 161*207e5cccSFangrui Song // 162*207e5cccSFangrui Song bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) { 163*207e5cccSFangrui Song return vld1q_bf16_x3(ptr); 164*207e5cccSFangrui Song } 165*207e5cccSFangrui Song 166*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1_bf16_x4( 167*207e5cccSFangrui Song // CHECK64-NEXT: entry: 168*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0(ptr [[PTR:%.*]]) 169*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 170*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 171*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 172*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3 173*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 174*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 175*207e5cccSFangrui Song // 
CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 176*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 177*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 178*207e5cccSFangrui Song // 179*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1_bf16_x4( 180*207e5cccSFangrui Song // CHECK32-NEXT: entry: 181*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr [[PTR:%.*]]) 182*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0 183*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1 184*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2 185*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3 186*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32> 187*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32> 188*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32> 189*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <2 x i32> 190*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0 191*207e5cccSFangrui Song 
// CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1 192*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2 193*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP3]], 3 194*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 195*207e5cccSFangrui Song // 196*207e5cccSFangrui Song bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) { 197*207e5cccSFangrui Song return vld1_bf16_x4(ptr); 198*207e5cccSFangrui Song } 199*207e5cccSFangrui Song 200*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld1q_bf16_x4( 201*207e5cccSFangrui Song // CHECK64-NEXT: entry: 202*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0(ptr [[PTR:%.*]]) 203*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 204*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 205*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 206*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3 207*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 208*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 209*207e5cccSFangrui Song 
// CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 210*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 211*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 212*207e5cccSFangrui Song // 213*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld1q_bf16_x4( 214*207e5cccSFangrui Song // CHECK32-NEXT: entry: 215*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr [[PTR:%.*]]) 216*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0 217*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1 218*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2 219*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3 220*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32> 221*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32> 222*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32> 223*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <4 x i32> 224*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0 225*207e5cccSFangrui 
Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1 226*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2 227*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP3]], 3 228*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 229*207e5cccSFangrui Song // 230*207e5cccSFangrui Song bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) { 231*207e5cccSFangrui Song return vld1q_bf16_x4(ptr); 232*207e5cccSFangrui Song } 233*207e5cccSFangrui Song 234*207e5cccSFangrui Song // CHECK-LABEL: @test_vld1q_dup_bf16( 235*207e5cccSFangrui Song // CHECK-NEXT: entry: 236*207e5cccSFangrui Song // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2 237*207e5cccSFangrui Song // CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> poison, bfloat [[TMP0]], i64 0 238*207e5cccSFangrui Song // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> poison, <8 x i32> zeroinitializer 239*207e5cccSFangrui Song // CHECK-NEXT: ret <8 x bfloat> [[LANE]] 240*207e5cccSFangrui Song // 241*207e5cccSFangrui Song bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) { 242*207e5cccSFangrui Song return vld1q_dup_bf16(ptr); 243*207e5cccSFangrui Song } 244*207e5cccSFangrui Song 245*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2_bf16( 246*207e5cccSFangrui Song // CHECK64-NEXT: entry: 247*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0(ptr [[PTR:%.*]]) 248*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0 249*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1 250*207e5cccSFangrui Song // 
CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 251*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 252*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 253*207e5cccSFangrui Song // 254*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2_bf16( 255*207e5cccSFangrui Song // CHECK32-NEXT: entry: 256*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 257*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 0 258*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 1 259*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_0_EXTRACT]] to <2 x i32> 260*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_1_EXTRACT]] to <2 x i32> 261*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 262*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 263*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 264*207e5cccSFangrui Song // 265*207e5cccSFangrui Song bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) { 266*207e5cccSFangrui Song return vld2_bf16(ptr); 267*207e5cccSFangrui Song } 268*207e5cccSFangrui Song 269*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2q_bf16( 270*207e5cccSFangrui Song // CHECK64-NEXT: entry: 271*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } 
@llvm.aarch64.neon.ld2.v8bf16.p0(ptr [[PTR:%.*]]) 272*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0 273*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1 274*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 275*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 276*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 277*207e5cccSFangrui Song // 278*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2q_bf16( 279*207e5cccSFangrui Song // CHECK32-NEXT: entry: 280*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 281*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 0 282*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 1 283*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_0_EXTRACT]] to <4 x i32> 284*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_1_EXTRACT]] to <4 x i32> 285*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 286*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 287*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 288*207e5cccSFangrui Song // 289*207e5cccSFangrui Song bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) { 
290*207e5cccSFangrui Song return vld2q_bf16(ptr); 291*207e5cccSFangrui Song } 292*207e5cccSFangrui Song 293*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2_lane_bf16( 294*207e5cccSFangrui Song // CHECK64-NEXT: entry: 295*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0 296*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE]], 1 297*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 298*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 0 299*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 1 300*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 301*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 302*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 303*207e5cccSFangrui Song // 304*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2_lane_bf16( 305*207e5cccSFangrui Song // CHECK32-NEXT: entry: 306*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE:%.*]], 0 307*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE]], 1 308*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 309*207e5cccSFangrui Song // 
CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 310*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2) 311*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 0 312*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 1 313*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_0_EXTRACT]] to <2 x i32> 314*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_1_EXTRACT]] to <2 x i32> 315*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP3]], 0 316*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP4]], 1 317*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 318*207e5cccSFangrui Song // 319*207e5cccSFangrui Song bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) { 320*207e5cccSFangrui Song return vld2_lane_bf16(ptr, src, 1); 321*207e5cccSFangrui Song } 322*207e5cccSFangrui Song 323*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2q_lane_bf16( 324*207e5cccSFangrui Song // CHECK64-NEXT: entry: 325*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0 326*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE]], 1 327*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], 
<8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 328*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 0 329*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 1 330*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 331*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 332*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 333*207e5cccSFangrui Song // 334*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2q_lane_bf16( 335*207e5cccSFangrui Song // CHECK32-NEXT: entry: 336*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE:%.*]], 0 337*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE]], 1 338*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 339*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 340*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2) 341*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 0 342*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 1 343*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast 
<8 x bfloat> [[VLD2Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32> 344*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32> 345*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP3]], 0 346*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP4]], 1 347*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 348*207e5cccSFangrui Song // 349*207e5cccSFangrui Song /* Codegen test input for vld2q_lane_bf16 with lane index 7: forwards ptr and src unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) { 350*207e5cccSFangrui Song return vld2q_lane_bf16(ptr, src, 7); 351*207e5cccSFangrui Song } 352*207e5cccSFangrui Song 353*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3_bf16( 354*207e5cccSFangrui Song // CHECK64-NEXT: entry: 355*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0(ptr [[PTR:%.*]]) 356*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0 357*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1 358*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2 359*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 360*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 361*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 
362*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 363*207e5cccSFangrui Song // 364*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3_bf16( 365*207e5cccSFangrui Song // CHECK32-NEXT: entry: 366*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 367*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 0 368*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 1 369*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 2 370*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_0_EXTRACT]] to <2 x i32> 371*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_1_EXTRACT]] to <2 x i32> 372*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_2_EXTRACT]] to <2 x i32> 373*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 374*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 375*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 376*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 377*207e5cccSFangrui Song // 378*207e5cccSFangrui Song /* Codegen test input for vld3_bf16: forwards ptr unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) { 379*207e5cccSFangrui Song return vld3_bf16(ptr); 380*207e5cccSFangrui Song } 381*207e5cccSFangrui Song 382*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3q_bf16( 383*207e5cccSFangrui Song // 
CHECK64-NEXT: entry: 384*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0(ptr [[PTR:%.*]]) 385*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0 386*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1 387*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2 388*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 389*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 390*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 391*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 392*207e5cccSFangrui Song // 393*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3q_bf16( 394*207e5cccSFangrui Song // CHECK32-NEXT: entry: 395*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 396*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 0 397*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 1 398*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 2 399*207e5cccSFangrui Song // 
CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_0_EXTRACT]] to <4 x i32> 400*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_1_EXTRACT]] to <4 x i32> 401*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_2_EXTRACT]] to <4 x i32> 402*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 403*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 404*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 405*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 406*207e5cccSFangrui Song // 407*207e5cccSFangrui Song /* Codegen test input for vld3q_bf16: forwards ptr unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) { 408*207e5cccSFangrui Song return vld3q_bf16(ptr); 409*207e5cccSFangrui Song } 410*207e5cccSFangrui Song 411*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3_lane_bf16( 412*207e5cccSFangrui Song // CHECK64-NEXT: entry: 413*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0 414*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 1 415*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 2 416*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 417*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 0 
418*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 1 419*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 2 420*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 421*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 422*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 423*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 424*207e5cccSFangrui Song // 425*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3_lane_bf16( 426*207e5cccSFangrui Song // CHECK32-NEXT: entry: 427*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE:%.*]], 0 428*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 1 429*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 2 430*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 431*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 432*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 433*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> 
[[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2) 434*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 0 435*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 1 436*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 2 437*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_0_EXTRACT]] to <2 x i32> 438*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_1_EXTRACT]] to <2 x i32> 439*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_2_EXTRACT]] to <2 x i32> 440*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP4]], 0 441*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP5]], 1 442*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP6]], 2 443*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 444*207e5cccSFangrui Song // 445*207e5cccSFangrui Song /* Codegen test input for vld3_lane_bf16 with lane index 1: forwards ptr and src unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) { 446*207e5cccSFangrui Song return vld3_lane_bf16(ptr, src, 1); 447*207e5cccSFangrui Song } 448*207e5cccSFangrui Song 449*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3q_lane_bf16( 450*207e5cccSFangrui Song // CHECK64-NEXT: entry: 451*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0 452*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
bfloat>] [[SRC_COERCE]], 1 453*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 2 454*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 455*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 0 456*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 1 457*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 2 458*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 459*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 460*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 461*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 462*207e5cccSFangrui Song // 463*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3q_lane_bf16( 464*207e5cccSFangrui Song // CHECK32-NEXT: entry: 465*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE:%.*]], 0 466*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 1 467*207e5cccSFangrui Song // CHECK32-NEXT: 
[[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 2 468*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 469*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 470*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 471*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2) 472*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 0 473*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 1 474*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 2 475*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32> 476*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32> 477*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32> 478*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP4]], 0 479*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP5]], 1 480*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP6]], 2 481*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 
x <4 x i32>] [[DOTFCA_2_INSERT]] 482*207e5cccSFangrui Song // 483*207e5cccSFangrui Song /* Codegen test input for vld3q_lane_bf16 with lane index 7: forwards ptr and src unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) { 484*207e5cccSFangrui Song return vld3q_lane_bf16(ptr, src, 7); 485*207e5cccSFangrui Song // NOTE(review): disabled variant using lane 8, which looks out of range for the 8-element vectors used here (7 is the highest lane exercised) - presumably a leftover; confirm before removing: return vld3q_lane_bf16(ptr, src, 8); 486*207e5cccSFangrui Song } 487*207e5cccSFangrui Song 488*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4_bf16( 489*207e5cccSFangrui Song // CHECK64-NEXT: entry: 490*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0(ptr [[PTR:%.*]]) 491*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0 492*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1 493*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2 494*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3 495*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 496*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 497*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 498*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 499*207e5cccSFangrui Song // CHECK64-NEXT: ret 
[[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 500*207e5cccSFangrui Song // 501*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4_bf16( 502*207e5cccSFangrui Song // CHECK32-NEXT: entry: 503*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 504*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 0 505*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 1 506*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 2 507*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 3 508*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_0_EXTRACT]] to <2 x i32> 509*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_1_EXTRACT]] to <2 x i32> 510*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_2_EXTRACT]] to <2 x i32> 511*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_3_EXTRACT]] to <2 x i32> 512*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 513*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 514*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 515*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] 
[[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3 516*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 517*207e5cccSFangrui Song // 518*207e5cccSFangrui Song /* Codegen test input for vld4_bf16: forwards ptr unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) { 519*207e5cccSFangrui Song return vld4_bf16(ptr); 520*207e5cccSFangrui Song } 521*207e5cccSFangrui Song 522*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4q_bf16( 523*207e5cccSFangrui Song // CHECK64-NEXT: entry: 524*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0(ptr [[PTR:%.*]]) 525*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0 526*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1 527*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2 528*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3 529*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 530*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 531*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 532*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 533*207e5cccSFangrui Song // CHECK64-NEXT: ret 
[[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 534*207e5cccSFangrui Song // 535*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4q_bf16( 536*207e5cccSFangrui Song // CHECK32-NEXT: entry: 537*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 538*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 0 539*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 1 540*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 2 541*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 3 542*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_0_EXTRACT]] to <4 x i32> 543*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_1_EXTRACT]] to <4 x i32> 544*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_2_EXTRACT]] to <4 x i32> 545*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_3_EXTRACT]] to <4 x i32> 546*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 547*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 548*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 549*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] 
[[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3 550*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 551*207e5cccSFangrui Song // 552*207e5cccSFangrui Song /* Codegen test input for vld4q_bf16: forwards ptr unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) { 553*207e5cccSFangrui Song return vld4q_bf16(ptr); 554*207e5cccSFangrui Song } 555*207e5cccSFangrui Song 556*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4_lane_bf16( 557*207e5cccSFangrui Song // CHECK64-NEXT: entry: 558*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0 559*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 1 560*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 2 561*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 3 562*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 563*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 0 564*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 1 565*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 2 566*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 
3 567*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 568*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 569*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 570*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 571*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 572*207e5cccSFangrui Song // 573*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4_lane_bf16( 574*207e5cccSFangrui Song // CHECK32-NEXT: entry: 575*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE:%.*]], 0 576*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 1 577*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 2 578*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 3 579*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 580*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 581*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 582*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 583*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_LANE_V:%.*]] = 
tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2) 584*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 0 585*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 1 586*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 2 587*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 3 588*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_0_EXTRACT]] to <2 x i32> 589*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_1_EXTRACT]] to <2 x i32> 590*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_2_EXTRACT]] to <2 x i32> 591*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_3_EXTRACT]] to <2 x i32> 592*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP5]], 0 593*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP6]], 1 594*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP7]], 2 595*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP8]], 3 596*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <2 x 
i32>] [[DOTFCA_3_INSERT]] 597*207e5cccSFangrui Song // 598*207e5cccSFangrui Song /* Codegen test input for vld4_lane_bf16 with lane index 1: forwards ptr and src unchanged and returns the intrinsic's result. The preceding FileCheck assertions are autogenerated (utils/update_cc_test_checks.py); regenerate them if this body changes. */ bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) { 599*207e5cccSFangrui Song return vld4_lane_bf16(ptr, src, 1); 600*207e5cccSFangrui Song } 601*207e5cccSFangrui Song 602*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4q_lane_bf16( 603*207e5cccSFangrui Song // CHECK64-NEXT: entry: 604*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0 605*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 1 606*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 2 607*207e5cccSFangrui Song // CHECK64-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 3 608*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 609*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 0 610*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 1 611*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 2 612*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 3 613*207e5cccSFangrui Song // CHECK64-NEXT: 
[[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 614*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 615*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 616*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 617*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 618*207e5cccSFangrui Song // 619*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4q_lane_bf16( 620*207e5cccSFangrui Song // CHECK32-NEXT: entry: 621*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE:%.*]], 0 622*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 1 623*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 2 624*207e5cccSFangrui Song // CHECK32-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 3 625*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 626*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 627*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 628*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 629*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 
x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2) 630*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 0 631*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 1 632*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 2 633*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 3 634*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32> 635*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32> 636*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32> 637*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_3_EXTRACT]] to <4 x i32> 638*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP5]], 0 639*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP6]], 1 640*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP7]], 2 641*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP8]], 3 642*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 
643*207e5cccSFangrui Song // 644*207e5cccSFangrui Song bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) { 645*207e5cccSFangrui Song return vld4q_lane_bf16(ptr, src, 7); 646*207e5cccSFangrui Song } 647*207e5cccSFangrui Song 648*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2_dup_bf16( 649*207e5cccSFangrui Song // CHECK64-NEXT: entry: 650*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0(ptr [[PTR:%.*]]) 651*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0 652*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1 653*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 654*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 655*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] 656*207e5cccSFangrui Song // 657*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2_dup_bf16( 658*207e5cccSFangrui Song // CHECK32-NEXT: entry: 659*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 660*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 0 661*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 1 662*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_0_EXTRACT]] to <2 x i32> 663*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> 
[[VLD2_DUP_V_FCA_1_EXTRACT]] to <2 x i32> 664*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 665*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 666*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]] 667*207e5cccSFangrui Song // 668*207e5cccSFangrui Song bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) { 669*207e5cccSFangrui Song return vld2_dup_bf16(ptr); 670*207e5cccSFangrui Song } 671*207e5cccSFangrui Song 672*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld2q_dup_bf16( 673*207e5cccSFangrui Song // CHECK64-NEXT: entry: 674*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0(ptr [[PTR:%.*]]) 675*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0 676*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1 677*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0 678*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1 679*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]] 680*207e5cccSFangrui Song // 681*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld2q_dup_bf16( 682*207e5cccSFangrui Song // CHECK32-NEXT: entry: 683*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 684*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } 
[[VLD2Q_DUP_V]], 0 685*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD2Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 1 686*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32> 687*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32> 688*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 689*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 690*207e5cccSFangrui Song // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]] 691*207e5cccSFangrui Song // 692*207e5cccSFangrui Song bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) { 693*207e5cccSFangrui Song return vld2q_dup_bf16(ptr); 694*207e5cccSFangrui Song } 695*207e5cccSFangrui Song 696*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3_dup_bf16( 697*207e5cccSFangrui Song // CHECK64-NEXT: entry: 698*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0(ptr [[PTR:%.*]]) 699*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0 700*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1 701*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2 702*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 703*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> 
[[VLD3_FCA_1_EXTRACT]], 0, 1 704*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 705*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] 706*207e5cccSFangrui Song // 707*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3_dup_bf16( 708*207e5cccSFangrui Song // CHECK32-NEXT: entry: 709*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 710*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 0 711*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 1 712*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 2 713*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_0_EXTRACT]] to <2 x i32> 714*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_1_EXTRACT]] to <2 x i32> 715*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_2_EXTRACT]] to <2 x i32> 716*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 717*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 718*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 719*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]] 720*207e5cccSFangrui Song // 721*207e5cccSFangrui Song 
bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) { 722*207e5cccSFangrui Song return vld3_dup_bf16(ptr); 723*207e5cccSFangrui Song } 724*207e5cccSFangrui Song 725*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld3q_dup_bf16( 726*207e5cccSFangrui Song // CHECK64-NEXT: entry: 727*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0(ptr [[PTR:%.*]]) 728*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0 729*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1 730*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2 731*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0 732*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1 733*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2 734*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] 735*207e5cccSFangrui Song // 736*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld3q_dup_bf16( 737*207e5cccSFangrui Song // CHECK32-NEXT: entry: 738*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 739*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 0 740*207e5cccSFangrui Song // CHECK32-NEXT: 
[[VLD3Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 1 741*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 2 742*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32> 743*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32> 744*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32> 745*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 746*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 747*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 748*207e5cccSFangrui Song // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]] 749*207e5cccSFangrui Song // 750*207e5cccSFangrui Song bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) { 751*207e5cccSFangrui Song return vld3q_dup_bf16(ptr); 752*207e5cccSFangrui Song } 753*207e5cccSFangrui Song 754*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4_dup_bf16( 755*207e5cccSFangrui Song // CHECK64-NEXT: entry: 756*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0(ptr [[PTR:%.*]]) 757*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0 758*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1 759*207e5cccSFangrui Song // CHECK64-NEXT: 
[[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2 760*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3 761*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 762*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 763*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 764*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 765*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]] 766*207e5cccSFangrui Song // 767*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4_dup_bf16( 768*207e5cccSFangrui Song // CHECK32-NEXT: entry: 769*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2) 770*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 0 771*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 1 772*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 2 773*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x 
bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 3 774*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_0_EXTRACT]] to <2 x i32> 775*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_1_EXTRACT]] to <2 x i32> 776*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_2_EXTRACT]] to <2 x i32> 777*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_3_EXTRACT]] to <2 x i32> 778*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0 779*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1 780*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2 781*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3 782*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]] 783*207e5cccSFangrui Song // 784*207e5cccSFangrui Song bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) { 785*207e5cccSFangrui Song return vld4_dup_bf16(ptr); 786*207e5cccSFangrui Song } 787*207e5cccSFangrui Song 788*207e5cccSFangrui Song // CHECK64-LABEL: @test_vld4q_dup_bf16( 789*207e5cccSFangrui Song // CHECK64-NEXT: entry: 790*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0(ptr [[PTR:%.*]]) 791*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0 792*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1 
793*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2 794*207e5cccSFangrui Song // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3 795*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0 796*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1 797*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2 798*207e5cccSFangrui Song // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3 799*207e5cccSFangrui Song // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] 800*207e5cccSFangrui Song // 801*207e5cccSFangrui Song // CHECK32-LABEL: @test_vld4q_dup_bf16( 802*207e5cccSFangrui Song // CHECK32-NEXT: entry: 803*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2) 804*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 0 805*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 1 806*207e5cccSFangrui Song // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 2 807*207e5cccSFangrui Song // CHECK32-NEXT: 
[[VLD4Q_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 3 808*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32> 809*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32> 810*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32> 811*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_3_EXTRACT]] to <4 x i32> 812*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0 813*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1 814*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2 815*207e5cccSFangrui Song // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3 816*207e5cccSFangrui Song // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]] 817*207e5cccSFangrui Song // 818*207e5cccSFangrui Song bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) { 819*207e5cccSFangrui Song return vld4q_dup_bf16(ptr); 820*207e5cccSFangrui Song } 821*207e5cccSFangrui Song 822*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16( 823*207e5cccSFangrui Song // CHECK64-NEXT: entry: 824*207e5cccSFangrui Song // CHECK64-NEXT: store <4 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2 825*207e5cccSFangrui Song // CHECK64-NEXT: ret void 826*207e5cccSFangrui Song // 827*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16( 828*207e5cccSFangrui Song // CHECK32-NEXT: entry: 829*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> 
[[VAL:%.*]], i32 2) 830*207e5cccSFangrui Song // CHECK32-NEXT: ret void 831*207e5cccSFangrui Song // 832*207e5cccSFangrui Song void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) { 833*207e5cccSFangrui Song vst1_bf16(ptr, val); 834*207e5cccSFangrui Song } 835*207e5cccSFangrui Song 836*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16( 837*207e5cccSFangrui Song // CHECK64-NEXT: entry: 838*207e5cccSFangrui Song // CHECK64-NEXT: store <8 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2 839*207e5cccSFangrui Song // CHECK64-NEXT: ret void 840*207e5cccSFangrui Song // 841*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16( 842*207e5cccSFangrui Song // CHECK32-NEXT: entry: 843*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[VAL:%.*]], i32 2) 844*207e5cccSFangrui Song // CHECK32-NEXT: ret void 845*207e5cccSFangrui Song // 846*207e5cccSFangrui Song void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) { 847*207e5cccSFangrui Song vst1q_bf16(ptr, val); 848*207e5cccSFangrui Song } 849*207e5cccSFangrui Song 850*207e5cccSFangrui Song // CHECK-LABEL: @test_vst1_lane_bf16( 851*207e5cccSFangrui Song // CHECK-NEXT: entry: 852*207e5cccSFangrui Song // CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x bfloat> [[VAL:%.*]], i64 1 853*207e5cccSFangrui Song // CHECK-NEXT: store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2 854*207e5cccSFangrui Song // CHECK-NEXT: ret void 855*207e5cccSFangrui Song // 856*207e5cccSFangrui Song void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) { 857*207e5cccSFangrui Song vst1_lane_bf16(ptr, val, 1); 858*207e5cccSFangrui Song } 859*207e5cccSFangrui Song 860*207e5cccSFangrui Song // CHECK-LABEL: @test_vst1q_lane_bf16( 861*207e5cccSFangrui Song // CHECK-NEXT: entry: 862*207e5cccSFangrui Song // CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x bfloat> [[VAL:%.*]], i64 7 863*207e5cccSFangrui Song // CHECK-NEXT: store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2 
864*207e5cccSFangrui Song // CHECK-NEXT: ret void 865*207e5cccSFangrui Song // 866*207e5cccSFangrui Song void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) { 867*207e5cccSFangrui Song vst1q_lane_bf16(ptr, val, 7); 868*207e5cccSFangrui Song } 869*207e5cccSFangrui Song 870*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16_x2( 871*207e5cccSFangrui Song // CHECK64-NEXT: entry: 872*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 873*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1 874*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 875*207e5cccSFangrui Song // CHECK64-NEXT: ret void 876*207e5cccSFangrui Song // 877*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16_x2( 878*207e5cccSFangrui Song // CHECK32-NEXT: entry: 879*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0 880*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1 881*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 882*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 883*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]]) 884*207e5cccSFangrui Song // CHECK32-NEXT: ret void 885*207e5cccSFangrui Song // 886*207e5cccSFangrui Song void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) { 887*207e5cccSFangrui Song vst1_bf16_x2(ptr, val); 888*207e5cccSFangrui Song } 889*207e5cccSFangrui Song 890*207e5cccSFangrui Song // 
CHECK64-LABEL: @test_vst1q_bf16_x2( 891*207e5cccSFangrui Song // CHECK64-NEXT: entry: 892*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 893*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1 894*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 895*207e5cccSFangrui Song // CHECK64-NEXT: ret void 896*207e5cccSFangrui Song // 897*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16_x2( 898*207e5cccSFangrui Song // CHECK32-NEXT: entry: 899*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0 900*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1 901*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 902*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 903*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]]) 904*207e5cccSFangrui Song // CHECK32-NEXT: ret void 905*207e5cccSFangrui Song // 906*207e5cccSFangrui Song void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) { 907*207e5cccSFangrui Song vst1q_bf16_x2(ptr, val); 908*207e5cccSFangrui Song } 909*207e5cccSFangrui Song 910*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16_x3( 911*207e5cccSFangrui Song // CHECK64-NEXT: entry: 912*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 913*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue 
[3 x <4 x bfloat>] [[VAL_COERCE]], 1 914*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2 915*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 916*207e5cccSFangrui Song // CHECK64-NEXT: ret void 917*207e5cccSFangrui Song // 918*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16_x3( 919*207e5cccSFangrui Song // CHECK32-NEXT: entry: 920*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0 921*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1 922*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2 923*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 924*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 925*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 926*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]]) 927*207e5cccSFangrui Song // CHECK32-NEXT: ret void 928*207e5cccSFangrui Song // 929*207e5cccSFangrui Song void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) { 930*207e5cccSFangrui Song vst1_bf16_x3(ptr, val); 931*207e5cccSFangrui Song } 932*207e5cccSFangrui Song 933*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16_x3( 934*207e5cccSFangrui Song // CHECK64-NEXT: entry: 935*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 
x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 936*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1 937*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2 938*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 939*207e5cccSFangrui Song // CHECK64-NEXT: ret void 940*207e5cccSFangrui Song // 941*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16_x3( 942*207e5cccSFangrui Song // CHECK32-NEXT: entry: 943*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0 944*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1 945*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2 946*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 947*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 948*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 949*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]]) 950*207e5cccSFangrui Song // CHECK32-NEXT: ret void 951*207e5cccSFangrui Song // 952*207e5cccSFangrui Song void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) { 953*207e5cccSFangrui Song vst1q_bf16_x3(ptr, val); 954*207e5cccSFangrui Song } 955*207e5cccSFangrui Song 956*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1_bf16_x4( 
957*207e5cccSFangrui Song // CHECK64-NEXT: entry: 958*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 959*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1 960*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2 961*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3 962*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 963*207e5cccSFangrui Song // CHECK64-NEXT: ret void 964*207e5cccSFangrui Song // 965*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1_bf16_x4( 966*207e5cccSFangrui Song // CHECK32-NEXT: entry: 967*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0 968*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1 969*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2 970*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3 971*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 972*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 973*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 974*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> 
[[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 975*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]]) 976*207e5cccSFangrui Song // CHECK32-NEXT: ret void 977*207e5cccSFangrui Song // 978*207e5cccSFangrui Song void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) { 979*207e5cccSFangrui Song vst1_bf16_x4(ptr, val); 980*207e5cccSFangrui Song } 981*207e5cccSFangrui Song 982*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst1q_bf16_x4( 983*207e5cccSFangrui Song // CHECK64-NEXT: entry: 984*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 985*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1 986*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2 987*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3 988*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 989*207e5cccSFangrui Song // CHECK64-NEXT: ret void 990*207e5cccSFangrui Song // 991*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst1q_bf16_x4( 992*207e5cccSFangrui Song // CHECK32-NEXT: entry: 993*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0 994*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1 995*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] 
[[VAL_COERCE]], 2 996*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3 997*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 998*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 999*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1000*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 1001*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]]) 1002*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1003*207e5cccSFangrui Song // 1004*207e5cccSFangrui Song void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) { 1005*207e5cccSFangrui Song vst1q_bf16_x4(ptr, val); 1006*207e5cccSFangrui Song } 1007*207e5cccSFangrui Song 1008*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2_bf16( 1009*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1010*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1011*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1 1012*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 1013*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1014*207e5cccSFangrui Song // 1015*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2_bf16( 1016*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1017*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] 
[[VAL_COERCE:%.*]], 0 1018*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1 1019*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1020*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1021*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 2) 1022*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1023*207e5cccSFangrui Song // 1024*207e5cccSFangrui Song void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { 1025*207e5cccSFangrui Song vst2_bf16(ptr, val); 1026*207e5cccSFangrui Song } 1027*207e5cccSFangrui Song 1028*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2q_bf16( 1029*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1030*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1031*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1 1032*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]]) 1033*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1034*207e5cccSFangrui Song // 1035*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2q_bf16( 1036*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1037*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1038*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1 1039*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1040*207e5cccSFangrui Song // 
CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1041*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 2) 1042*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1043*207e5cccSFangrui Song // 1044*207e5cccSFangrui Song void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { 1045*207e5cccSFangrui Song vst2q_bf16(ptr, val); 1046*207e5cccSFangrui Song } 1047*207e5cccSFangrui Song 1048*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2_lane_bf16( 1049*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1050*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1051*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1 1052*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 1053*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1054*207e5cccSFangrui Song // 1055*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2_lane_bf16( 1056*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1057*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1058*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1 1059*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1060*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1061*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2) 
1062*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1063*207e5cccSFangrui Song // 1064*207e5cccSFangrui Song void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { 1065*207e5cccSFangrui Song vst2_lane_bf16(ptr, val, 1); 1066*207e5cccSFangrui Song } 1067*207e5cccSFangrui Song 1068*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst2q_lane_bf16( 1069*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1070*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1071*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1 1072*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 1073*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1074*207e5cccSFangrui Song // 1075*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst2q_lane_bf16( 1076*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1077*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1078*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1 1079*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1080*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1081*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2) 1082*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1083*207e5cccSFangrui Song // 1084*207e5cccSFangrui Song void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { 1085*207e5cccSFangrui Song vst2q_lane_bf16(ptr, val, 7); 1086*207e5cccSFangrui 
Song } 1087*207e5cccSFangrui Song 1088*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3_bf16( 1089*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1090*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1091*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1 1092*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2 1093*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 1094*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1095*207e5cccSFangrui Song // 1096*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3_bf16( 1097*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1098*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1099*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1 1100*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2 1101*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1102*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1103*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1104*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 2) 1105*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1106*207e5cccSFangrui Song // 
1107*207e5cccSFangrui Song void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { 1108*207e5cccSFangrui Song vst3_bf16(ptr, val); 1109*207e5cccSFangrui Song } 1110*207e5cccSFangrui Song 1111*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3q_bf16( 1112*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1113*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1114*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1 1115*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2 1116*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]]) 1117*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1118*207e5cccSFangrui Song // 1119*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3q_bf16( 1120*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1121*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1122*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1 1123*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2 1124*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1125*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1126*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1127*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> 
[[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 2) 1128*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1129*207e5cccSFangrui Song // 1130*207e5cccSFangrui Song void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { 1131*207e5cccSFangrui Song vst3q_bf16(ptr, val); 1132*207e5cccSFangrui Song } 1133*207e5cccSFangrui Song 1134*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3_lane_bf16( 1135*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1136*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1137*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1 1138*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2 1139*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 1140*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1141*207e5cccSFangrui Song // 1142*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3_lane_bf16( 1143*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1144*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1145*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1 1146*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2 1147*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1148*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1149*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> 
[[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1150*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2) 1151*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1152*207e5cccSFangrui Song // 1153*207e5cccSFangrui Song void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { 1154*207e5cccSFangrui Song vst3_lane_bf16(ptr, val, 1); 1155*207e5cccSFangrui Song } 1156*207e5cccSFangrui Song 1157*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst3q_lane_bf16( 1158*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1159*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1160*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1 1161*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2 1162*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 1163*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1164*207e5cccSFangrui Song // 1165*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst3q_lane_bf16( 1166*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1167*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1168*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1 1169*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2 1170*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 
1171*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1172*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1173*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2) 1174*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1175*207e5cccSFangrui Song // 1176*207e5cccSFangrui Song void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { 1177*207e5cccSFangrui Song vst3q_lane_bf16(ptr, val, 7); 1178*207e5cccSFangrui Song } 1179*207e5cccSFangrui Song 1180*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4_bf16( 1181*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1182*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1183*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1 1184*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2 1185*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3 1186*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 1187*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1188*207e5cccSFangrui Song // 1189*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4_bf16( 1190*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1191*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1192*207e5cccSFangrui Song // 
CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1 1193*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2 1194*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3 1195*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1196*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1197*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1198*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 1199*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 2) 1200*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1201*207e5cccSFangrui Song // 1202*207e5cccSFangrui Song void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { 1203*207e5cccSFangrui Song vst4_bf16(ptr, val); 1204*207e5cccSFangrui Song } 1205*207e5cccSFangrui Song 1206*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4q_bf16( 1207*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1208*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1209*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1 1210*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2 1211*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3 1212*207e5cccSFangrui Song // CHECK64-NEXT: tail 
call void @llvm.aarch64.neon.st4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]]) 1213*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1214*207e5cccSFangrui Song // 1215*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4q_bf16( 1216*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1217*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1218*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1 1219*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2 1220*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3 1221*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1222*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1223*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1224*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 1225*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 2) 1226*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1227*207e5cccSFangrui Song // 1228*207e5cccSFangrui Song void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { 1229*207e5cccSFangrui Song vst4q_bf16(ptr, val); 1230*207e5cccSFangrui Song } 1231*207e5cccSFangrui Song 1232*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4_lane_bf16( 1233*207e5cccSFangrui Song 
// CHECK64-NEXT: entry: 1234*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0 1235*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1 1236*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2 1237*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3 1238*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]]) 1239*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1240*207e5cccSFangrui Song // 1241*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4_lane_bf16( 1242*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1243*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0 1244*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1 1245*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2 1246*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3 1247*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat> 1248*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat> 1249*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat> 1250*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> 
[[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat> 1251*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2) 1252*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1253*207e5cccSFangrui Song // 1254*207e5cccSFangrui Song void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { 1255*207e5cccSFangrui Song vst4_lane_bf16(ptr, val, 1); 1256*207e5cccSFangrui Song } 1257*207e5cccSFangrui Song 1258*207e5cccSFangrui Song // CHECK64-LABEL: @test_vst4q_lane_bf16( 1259*207e5cccSFangrui Song // CHECK64-NEXT: entry: 1260*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0 1261*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1 1262*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2 1263*207e5cccSFangrui Song // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3 1264*207e5cccSFangrui Song // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]]) 1265*207e5cccSFangrui Song // CHECK64-NEXT: ret void 1266*207e5cccSFangrui Song // 1267*207e5cccSFangrui Song // CHECK32-LABEL: @test_vst4q_lane_bf16( 1268*207e5cccSFangrui Song // CHECK32-NEXT: entry: 1269*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0 1270*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1 1271*207e5cccSFangrui Song // CHECK32-NEXT: 
[[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2 1272*207e5cccSFangrui Song // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3 1273*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat> 1274*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat> 1275*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat> 1276*207e5cccSFangrui Song // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat> 1277*207e5cccSFangrui Song // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2) 1278*207e5cccSFangrui Song // CHECK32-NEXT: ret void 1279*207e5cccSFangrui Song // 1280*207e5cccSFangrui Song void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { 1281*207e5cccSFangrui Song vst4q_lane_bf16(ptr, val, 7); 1282*207e5cccSFangrui Song } 1283