;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "options.asm"
%include "stdmac.asm"

%ifndef UTILS_ASM
%define UTILS_ASM
; compare macros
;
; Each macro finds how many leading bytes of two buffers are equal.
; They expand inline (no call/ret) and all of them modify the flags.

;; sttni2 is faster, but it can't be debugged
;; so following code is based on "mine5"

;; compares 8 bytes at a time, using xor
;; assumes the input buffer has size at least 8
;; compare_r src1, src2, result, result_max, tmp
;;   src1, src2  : base registers of the two buffers
;;   result      : in:  offset at which comparing starts
;;                 out: offset of the first differing byte, or result_max
;;                      when no difference was found
;;   result_max  : offset one past the last byte to compare
;;   tmp         : scratch register, loaded 8 bytes at a time
;; Clobbers result_max, tmp and flags
%macro compare_r 5
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%result_max	%4
%define %%tmp		%5

	;; main loop needs 16 bytes available: result <= result_max - 16
	;; (signed compare, since result_max - 16 may be negative)
	sub	%%result_max, 16
	cmp	%%result, %%result_max
	jg	%%_by_8

%%loop1:
	;; unrolled twice: two 8-byte compares per iteration
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8

	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_8:
	;; result_max is now (original - 8): is one full 8-byte compare left?
	add	%%result_max, 8
	cmp	%%result, %%result_max
	jg	%%_cmp_last

	; compare the next 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8

%%_cmp_last:
	;; result_max is back to its original value here
	add	%%result_max, 8
	cmp	%%result, %%result_max
	je	%%end

	;; fewer than 8 bytes remain: step back so that the final 8-byte
	;; load ends exactly at result_max. Bytes re-read below the old
	;; result are presumed to have matched already.
	lea	%%result, [%%result_max - 8]

	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	jmp	%%end

%%miscompare_reg:
	;; tmp = src1 xor src2 (non-zero): lowest set bit / 8 is the index
	;; of the first differing byte within the 8 bytes just compared
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%end:
%endm

;; compares 16 bytes at a time, using PCMPEQB/PMOVMSKB
;; assumes the input buffer has size at least 8
;; compare_x src1, src2, result, result_max, tmp, xtmp1, xtmp2
;;   src1, src2   : base registers of the two buffers
;;   result       : in:  offset at which comparing starts
;;                  out: offset of the first differing byte, or result_max
;;                       when no difference was found
;;   result_max   : offset one past the last byte to compare
;;   tmp          : general purpose scratch register
;;   xtmp1, xtmp2 : vector scratch registers
;; Clobbers result_max, tmp, xtmp1, xtmp2 and flags
;; NOTE(review): MOVDQU/PCMPEQB/PMOVMSKB are not defined in this file;
;; presumably they are wrapper macros from the included files - confirm.
%macro compare_x 7
%define %%src1		%1
%define %%src2		%2
%define %%result	%3	; Accumulator for match_length
%define %%result_max	%4
%define %%tmp		%5
%define %%tmp32		%5d	; tmp as a 32-bit register
%define %%xtmp		%6
%define %%xtmp2		%7

	;; main loop needs 32 bytes available: result <= result_max - 32
	sub	%%result_max, 32
	cmp	%%result, %%result_max
	jg	%%_by_16

%%loop1:
	;; unrolled twice: two 16-byte compares per iteration
	MOVDQU		%%xtmp, [%%src1 + %%result]
	MOVDQU		%%xtmp2, [%%src2 + %%result]
	PCMPEQB		%%xtmp, %%xtmp, %%xtmp2
	PMOVMSKB	%%tmp32, %%xtmp
	xor		%%tmp, 0xFFFF	; flip the 16 mask bits: set bit = differing byte
	jnz		%%miscompare_vect
	add		%%result, 16

	MOVDQU		%%xtmp, [%%src1 + %%result]
	MOVDQU		%%xtmp2, [%%src2 + %%result]
	PCMPEQB		%%xtmp, %%xtmp, %%xtmp2
	PMOVMSKB	%%tmp32, %%xtmp
	xor		%%tmp, 0xFFFF
	jnz		%%miscompare_vect
	add		%%result, 16

	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_16:
	;; result_max is now (original - 16): is one 16-byte compare left?
	add	%%result_max, 16
	cmp	%%result, %%result_max
	jg	%%_by_8

	MOVDQU		%%xtmp, [%%src1 + %%result]
	MOVDQU		%%xtmp2, [%%src2 + %%result]
	PCMPEQB		%%xtmp, %%xtmp, %%xtmp2
	PMOVMSKB	%%tmp32, %%xtmp
	xor		%%tmp, 0xFFFF
	jnz		%%miscompare_vect
	add		%%result, 16

%%_by_8:
	;; result_max is now (original - 8): is one 8-byte compare left?
	add	%%result_max, 8
	cmp	%%result, %%result_max
	jg	%%_cmp_last

	; compare the next 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8

%%_cmp_last:
	;; result_max is back to its original value here
	add	%%result_max, 8
	cmp	%%result, %%result_max
	je	%%end

	;; fewer than 8 bytes remain: step back so that the final 8-byte
	;; load ends exactly at result_max. Bytes re-read below the old
	;; result are presumed to have matched already.
	lea	%%result, [%%result_max - 8]

	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	jmp	%%end

%%miscompare_reg:
	;; tmp = src1 xor src2: bit index / 8 = index of first differing byte
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
	jmp	%%end

%%miscompare_vect:
	;; tmp = inverted byte mask: bit index = index of first differing byte
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm

;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb on ymm registers
;; assumes the input buffer has size at least 8
;; compare_y src1, src2, result, result_max, tmp, ytmp1, ytmp2
;;   src1, src2   : base registers of the two buffers
;;   result       : in:  offset at which comparing starts
;;                  out: offset of the first differing byte, or result_max
;;                       when no difference was found
;;   result_max   : offset one past the last byte to compare
;;   tmp          : general purpose scratch register
;;   ytmp1, ytmp2 : ymm scratch registers
;; Clobbers result_max, tmp, ytmp1, ytmp2 and flags
;; NOTE(review): the 16-byte step builds register names by appending "x"
;; to the ymm arguments; presumably an included file defines those names
;; as the matching xmm registers - confirm.
%macro compare_y 7
%define %%src1		%1
%define %%src2		%2
%define %%result	%3	; Accumulator for match_length
%define %%result_max	%4
%define %%tmp		%5
%define %%tmp32		%5d	; tmp as a 32-bit register
%define %%ytmp		%6
%define %%ytmp2		%7

	;; main loop needs 64 bytes available: result <= result_max - 64
	sub	%%result_max, 64
	cmp	%%result, %%result_max
	jg	%%_by_32

%%loop1:
	;; unrolled twice: two 32-byte compares per iteration
	vmovdqu		%%ytmp, [%%src1 + %%result]
	vmovdqu		%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor		%%tmp32, 0xFFFFFFFF	; flip the 32 mask bits: set bit = differing byte
	jnz		%%miscompare_vect
	add		%%result, 32

	vmovdqu		%%ytmp, [%%src1 + %%result]
	vmovdqu		%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor		%%tmp32, 0xFFFFFFFF
	jnz		%%miscompare_vect
	add		%%result, 32

	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_32:
	;; result_max is now (original - 32): is one 32-byte compare left?
	add	%%result_max, 32
	cmp	%%result, %%result_max
	jg	%%_by_16

	vmovdqu		%%ytmp, [%%src1 + %%result]
	vmovdqu		%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor		%%tmp32, 0xFFFFFFFF
	jnz		%%miscompare_vect
	add		%%result, 32

%%_by_16:
	;; result_max is now (original - 16): is one 16-byte compare left?
	add	%%result_max, 16
	cmp	%%result, %%result_max
	jg	%%_by_8

	vmovdqu		%%ytmp %+ x, [%%src1 + %%result]
	vmovdqu		%%ytmp2 %+ x, [%%src2 + %%result]
	vpcmpeqb	%%ytmp %+ x, %%ytmp %+ x, %%ytmp2 %+ x
	vpmovmskb	%%tmp, %%ytmp %+ x
	xor		%%tmp32, 0xFFFF
	jnz		%%miscompare_vect
	add		%%result, 16

%%_by_8:
	;; result_max is now (original - 8): is one 8-byte compare left?
	add	%%result_max, 8
	cmp	%%result, %%result_max
	jg	%%_cmp_last

	; compare the next 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8

%%_cmp_last:
	;; result_max is back to its original value here
	add	%%result_max, 8
	cmp	%%result, %%result_max
	je	%%end

	;; fewer than 8 bytes remain: step back so that the final 8-byte
	;; load ends exactly at result_max. Bytes re-read below the old
	;; result are presumed to have matched already.
	lea	%%result, [%%result_max - 8]

	; compare the final 8 bytes
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare_reg
	add	%%result, 8
	jmp	%%end

%%miscompare_reg:
	;; tmp = src1 xor src2: bit index / 8 = index of first differing byte
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
	jmp	%%end

%%miscompare_vect:
	;; tmp = inverted byte mask: bit index = index of first differing byte
	tzcnt	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm

;; compares 64 bytes at a time
;; compare_z src1, src2, result, result_max, tmp, ktmp, ztmp1, ztmp2
;;   src1, src2   : base registers of the two buffers
;;   result       : in:  offset at which comparing starts
;;                  out: offset of the first differing byte, or result_max
;;                       when no difference was found
;;   result_max   : offset one past the last byte to compare
;;   tmp          : 64-bit general purpose scratch register
;;   ktmp         : opmask scratch register
;;   ztmp1, ztmp2 : zmm scratch registers
;; Clobbers result_max (as well as tmp, ktmp, ztmp1, ztmp2 and flags)
%macro compare_z 8
%define %%src1		%1
%define %%src2		%2
%define %%result	%3	; Accumulator for match_length
%define %%result_max	%4
%define %%tmp		%5	; scratch; holds a 64-bit byte mask
%define %%ktmp		%6
%define %%ztmp		%7
%define %%ztmp2		%8

	;; main loop needs 128 bytes available: result <= result_max - 128
	sub	%%result_max, 128
	cmp	%%result, %%result_max
	jg	%%_by_64

%%loop1:
	;; unrolled twice: two 64-byte compares per iteration
	vmovdqu8	%%ztmp, [%%src1 + %%result]
	vmovdqu8	%%ztmp2, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ	; ktmp bit set = differing byte
	ktestq		%%ktmp, %%ktmp
	jnz		%%miscompare
	add		%%result, 64

	vmovdqu8	%%ztmp, [%%src1 + %%result]
	vmovdqu8	%%ztmp2, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp
	jnz		%%miscompare
	add		%%result, 64

	cmp	%%result, %%result_max
	jle	%%loop1

%%_by_64:
	;; result_max is now (original - 64): is one 64-byte compare left?
	add	%%result_max, 64
	cmp	%%result, %%result_max
	jg	%%_less_than_64

	vmovdqu8	%%ztmp, [%%src1 + %%result]
	vmovdqu8	%%ztmp2, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp
	jnz		%%miscompare
	add		%%result, 64

%%_less_than_64:
	;; result_max = original result_max - result = bytes still to compare
	add	%%result_max, 64
	sub	%%result_max, %%result
	jle	%%end

	;; build a mask with the low result_max bits set and use it for
	;; zero-masked loads, so bytes beyond the end load as zero in both
	;; vectors and compare equal
	mov	%%tmp, -1
	bzhi	%%tmp, %%tmp, %%result_max
	kmovq	%%ktmp, %%tmp

	vmovdqu8	%%ztmp {%%ktmp}{z}, [%%src1 + %%result]
	vmovdqu8	%%ztmp2 {%%ktmp}{z}, [%%src2 + %%result]
	vpcmpb		%%ktmp, %%ztmp, %%ztmp2, NEQ
	ktestq		%%ktmp, %%ktmp
	jnz		%%miscompare
	add		%%result, %%result_max

	jmp	%%end
%%miscompare:
	;; lowest set bit of ktmp = index of the first differing byte
	kmovq	%%tmp, %%ktmp
	tzcnt	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm

;; compare250 src1, src2, result, result_max, tmp, ytmp0, ytmp1
;; Clamps result_max to at most 250 (signed compare), then runs the
;; comparison selected by COMPARE_TYPE:
;;   1 -> compare_r, 2 -> compare_x, 3 -> compare_y
;; ytmp0/ytmp1 are given as ymm register names; for compare_x an "x" is
;; appended to each name to form the register that is passed on.
;; Clobbers result_max and tmp, plus whatever the selected macro clobbers.
%macro compare250 7
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%result_max	%4
%define %%tmp		%5
%define %%xtmp0		%6x
%define %%xtmp1		%7x
%define %%ytmp0		%6
%define %%ytmp1		%7

	;; result_max = min(result_max, 250)
	mov	%%tmp, 250
	cmp	%%result_max, 250
	cmovg	%%result_max, %%tmp

%if (COMPARE_TYPE == 1)
	compare_r	%%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
	compare_x	%%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
	compare_y	%%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
	;; NOTE(review): the bare "% error" line below is not a preprocessor
	;; directive; presumably it is there to force an assembly failure
	;; should %error alone not stop the build - confirm before removing.
	%error Unknown Compare type COMPARE_TYPE
	 % error
%endif
%endmacro

; Assumes the buffer has at least 8 bytes
; Accumulates match length onto result
;; compare_large src1, src2, result, result_max, tmp, ytmp0, ytmp1
;; Runs the comparison selected by COMPARE_TYPE without clamping
;; result_max:
;;   1 -> compare_r, 2 -> compare_x, 3 -> compare_y
;; ytmp0/ytmp1 are given as ymm register names; for compare_x an "x" is
;; appended to each name to form the register that is passed on.
;; Clobbers whatever the selected macro clobbers (result_max, tmp, ...).
%macro compare_large 7
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%result_max	%4
%define %%tmp		%5
%define %%xtmp0		%6x
%define %%xtmp1		%7x
%define %%ytmp0		%6
%define %%ytmp1		%7

%if (COMPARE_TYPE == 1)
	compare_r	%%src1, %%src2, %%result, %%result_max, %%tmp
%elif (COMPARE_TYPE == 2)
	compare_x	%%src1, %%src2, %%result, %%result_max, %%tmp, %%xtmp0, %%xtmp1
%elif (COMPARE_TYPE == 3)
	compare_y	%%src1, %%src2, %%result, %%result_max, %%tmp, %%ytmp0, %%ytmp1
%else
	;; NOTE(review): the bare "% error" line below is not a preprocessor
	;; directive; presumably it is there to force an assembly failure
	;; should %error alone not stop the build - confirm before removing.
	%error Unknown Compare type COMPARE_TYPE
	 % error
%endif
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; compare size, src1, src2, result, tmp
;;   size       : number of bytes to compare; counted down (clobbered)
;;   src1, src2 : base registers of the two buffers
;;   result     : out: number of leading bytes that match (zeroed on entry)
;;   tmp        : scratch register, loaded 8 bytes at a time
;; Compares 8 bytes at a time while at least 8 remain, then finishes the
;; remaining 0..7 bytes one byte at a time.
;; Clobbers size, tmp and flags
%macro compare 5
%define %%size		%1
%define %%src1		%2
%define %%src2		%3
%define %%result	%4
%define %%tmp		%5
%define %%tmp8		%5b	; tmp as a 8-bit register

	xor	%%result, %%result
	;; bias size by 7 so that "size > 0" means at least 8 bytes remain
	sub	%%size, 7
	jle	%%lab2
%%loop1:
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare
	add	%%result, 8
	sub	%%size, 8
	jg	%%loop1
%%lab2:
	;; if we fall through from above, we have found no mismatches,
	;; %%size+7 is the number of bytes left to look at, and %%result is the
	;; number of bytes that have matched
	add	%%size, 7
	jle	%%end
%%loop3:
	;; byte-at-a-time tail: stop at the first difference
	mov	%%tmp8, [%%src1 + %%result]
	cmp	%%tmp8, [%%src2 + %%result]
	jne	%%end
	inc	%%result
	dec	%%size
	jg	%%loop3
	jmp	%%end
%%miscompare:
	;; tmp = src1 xor src2: bit index / 8 = index of first differing byte
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%end:
%endm

%endif	;UTILS_ASM