/* memcmp - compare memory
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 */

#include "asmdefs.h"

/* Arguments and result.  result (w0) shares a register with src1 (x0), so
   src1 must not be read after result has been written.  */
#define src1	x0
#define src2	x1
#define limit	x2
#define result	w0

/* Scratch registers.  The "w" names are the 32-bit views of the same
   registers.  tmp shares x6 with data4.  */
#define data1	x3
#define data1w	w3
#define data2	x4
#define data2w	w4
#define data3	x5
#define data3w	w5
#define data4	x6
#define data4w	w6
#define tmp	x6
#define src1end	x7
#define src2end	x8


/* Compare limit bytes at src1 with limit bytes at src2.

   In:   src1 (x0), src2 (x1)	addresses of the two buffers
	 limit (x2)		number of bytes to compare
   Out:  result (w0)		0 if all bytes are equal; otherwise negative
				if the first differing byte, compared as
				unsigned, is lower in src1 and positive if it
				is higher.  Most paths return exactly -1 or 1;
				the single-byte tail in L(less2) returns the
				byte difference.

   Writes x3-x8 and the condition flags; the path for limit >= 160 also
   writes v0-v5.

   Size classes:
     0-15    L(less16): one or two possibly overlapping loads per buffer.
     16      the first ldp pair decides the result via L(return2).
     17-32   first 16 bytes, then the last 16 bytes (overlapping).
     33-159  L(loop32): scalar loop, then the last 16 bytes.
     >= 160  L(loop_align)/L(loop64): 64 bytes per iteration using Advanced
	     SIMD with src2 16-byte aligned, tail finished by the scalar
	     code.  */
ENTRY (__memcmp_aarch64)
	cmp	limit, 16
	b.lo	L(less16)
	ldp	data1, data3, [src1]
	ldp	data2, data4, [src2]
	/* The flags still hold limit vs 16.  If limit == 16 the "ne" condition
	   fails, both ccmp set NZCV to 0 (which reads as "ne") and the branch
	   below is taken, so L(return2) produces the final result from the 16
	   bytes just loaded.  Otherwise the branch is taken only if one of the
	   two 8-byte pairs differs.  */
	ccmp	data1, data2, 0, ne
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	add	src1end, src1, limit
	add	src2end, src2, limit
	cmp	limit, 32
	b.ls	L(last_bytes)
	cmp	limit, 160
	b.hs	L(loop_align)
	sub	limit, limit, 32

	/* Scalar loop: compares the 16 bytes at offset 16 and, unless the final
	   load in L(last_bytes) already covers the rest, the 16 bytes at
	   offset 32, then advances both pointers by 32.  */
	.p2align 4
L(loop32):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	cmp	limit, 16
	b.ls	L(last_bytes)

	ldp	data1, data3, [src1, 32]
	ldp	data2, data4, [src2, 32]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)
	add	src1, src1, 32
	add	src2, src2, 32
	/* Also entered from L(loop64) once no difference has been found in the
	   64-byte blocks.  */
L(last64):
	subs	limit, limit, 32
	b.hi	L(loop32)

	/* Compare last 1-16 bytes using unaligned access.  */
L(last_bytes):
	ldp	data1, data3, [src1end, -16]
	ldp	data2, data4, [src2end, -16]
	/* Select the pair that decides the result: data1/data2 if they differ,
	   otherwise data3/data4.  */
L(return2):
	cmp	data1, data2
	csel	data1, data1, data3, ne
	csel	data2, data2, data4, ne

	/* Compare data bytes and set return value to 0, -1 or 1.  */
L(return):
#ifndef __AARCH64EB__
	/* Byte-reverse so that the byte at the lowest address becomes the most
	   significant one; the unsigned compare below is then decided by the
	   first differing byte in memory order.  */
	rev	data1, data1
	rev	data2, data2
#endif
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret

	/* limit < 16.  Bits 3, 2, 1 and 0 of limit select the access size.  */
	.p2align 4
L(less16):
	add	src1end, src1, limit
	add	src2end, src2, limit
	tbz	limit, 3, L(less8)
	/* 8-15 bytes: first 8 and last 8 bytes, possibly overlapping.  */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ldr	data3, [src1end, -8]
	ldr	data4, [src2end, -8]
	b	L(return2)

	.p2align 4
L(less8):
	tbz	limit, 2, L(less4)
	/* 4-7 bytes: first 4 and last 4 bytes, possibly overlapping.  */
	ldr	data1w, [src1]
	ldr	data2w, [src2]
	ldr	data3w, [src1end, -4]
	ldr	data4w, [src2end, -4]
	b	L(return2)

L(less4):
	tbz	limit, 1, L(less2)
	/* 2-3 bytes: compare the first 2 bytes; if they are equal fall through
	   to check for an odd trailing byte.  */
	ldrh	data1w, [src1]
	ldrh	data2w, [src2]
	cmp	data1w, data2w
	b.ne	L(return)
L(less2):
	/* Writing result overwrites src1 (both are register 0), which is why
	   the last byte is addressed through the end pointers.  */
	mov	result, 0
	tbz	limit, 0, L(return_zero)
	ldrb	data1w, [src1end, -1]
	ldrb	data2w, [src2end, -1]
	sub	result, data1w, data2w
L(return_zero):
	ret

	/* limit >= 160.  Bytes 0-15 were compared at entry; compare bytes
	   16-31 before the pointers are moved.  */
L(loop_align):
	ldp	data1, data3, [src1, 16]
	ldp	data2, data4, [src2, 16]
	cmp	data1, data2
	ccmp	data3, data4, 0, eq
	b.ne	L(return2)

	/* Align src2 and adjust src1, src2 and limit.  */
	and	tmp, src2, 15
	/* tmp = (src2 & 15) - 16 is in the range -16..-1, so subtracting it
	   advances the pointers by 1-16 bytes and adding it shrinks limit by
	   the same amount.  */
	sub	tmp, tmp, 16
	sub	src2, src2, tmp
	add	limit, limit, tmp
	sub	src1, src1, tmp
	/* Bias limit by 64 + 16; the bias is removed after the loop.  */
	sub	limit, limit, 64 + 16

	/* Each iteration compares the 64 bytes starting at offset 16 from the
	   current pointers.  The pre-indexed loads of q4/q5 advance src1 and
	   src2 by 64.  */
	.p2align 4
L(loop64):
	ldr	q0, [src1, 16]
	ldr	q1, [src2, 16]
	subs	limit, limit, 64
	ldr	q2, [src1, 32]
	ldr	q3, [src2, 32]
	eor	v0.16b, v0.16b, v1.16b
	eor	v1.16b, v2.16b, v3.16b
	ldr	q2, [src1, 48]
	ldr	q3, [src2, 48]
	umaxp	v0.16b, v0.16b, v1.16b
	ldr	q4, [src1, 64]!
	ldr	q5, [src2, 64]!
	eor	v1.16b, v2.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	/* Pairwise unsigned maxima fold the 64 XOR bytes down to the 8 bytes
	   of d0; each byte of d0 is nonzero iff the corresponding 8-byte group
	   of the block contains a difference.  */
	umaxp	v1.16b, v1.16b, v2.16b
	umaxp	v0.16b, v0.16b, v1.16b
	umaxp	v0.16b, v0.16b, v0.16b
	fmov	tmp, d0
	/* The flags are still those of the subs above.  Continue while it
	   reported "hi" and no difference was found; when "hi" fails, ccmp
	   forces NZCV to 0 and the loop exits.  */
	ccmp	tmp, 0, 0, hi
	b.eq	L(loop64)

	/* If equal, process last 1-64 bytes using scalar loop.  */
	add	limit, limit, 64 + 16
	cbz	tmp, L(last64)

	/* Determine the 8-byte aligned offset of the first difference.  */
#ifdef __AARCH64EB__
	rev16	tmp, tmp
#endif
	rev	tmp, tmp
	clz	tmp, tmp
	/* Rounding the leading-zero count down to a multiple of 8 gives
	   8 * (index of the first nonzero byte), which is also the byte offset
	   of the differing 8-byte group within the block.  The block started
	   48 bytes before the already advanced src1/src2.  */
	bic	tmp, tmp, 7
	sub	tmp, tmp, 48
	ldr	data1, [src1, tmp]
	ldr	data2, [src2, tmp]
#ifndef __AARCH64EB__
	rev	data1, data1
	rev	data2, data2
#endif
	/* The group is known to differ, so the result is 1 or -1.  */
	mov	result, 1
	cmp	data1, data2
	cneg	result, result, lo
	ret

END (__memcmp_aarch64)