dnl  IA-64 mpn_lshiftc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2010 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C           cycles/limb
C Itanium:      ?
C Itanium 2:    1.25

C This code is scheduled deeply since the plain shift instructions shr and shl
C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
C these instructions causes a 10 cycle replay trap on Itanium.

C The ld8 scheduling should probably be decreased to make the function smaller.
C Good lfetch will make sure we never stall anyway.

C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
C in the prologue.
C ---------------------------------------------------------------------------
C mpn_lshiftc (rp, up, n, cnt)
C
C Left-shift the n limbs at up by cnt bits, complement every result limb and
C store the n results at rp.  As computed by the code below:
C
C	rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64-cnt)))	for 0 < i < n
C	rp[0] = ~(up[0] << cnt)
C	r8    = up[n-1] >> (64-cnt)		(left uncomplemented)
C
C r8 is the value handed back to the caller (integer return register in the
C Itanium calling convention).
C
C Both vectors are processed from the highest-addressed limb downwards; UPD
C is the post-increment applied to up and rp after every ld8 and st8.
C
C NOTE(review): nothing here checks the arguments.  The code relies on
C n >= 1, since n-1 is used both as limb offset and to derive the loop count,
C and presumably on 0 < cnt < 64 -- confirm against the mpn interface docs.
C
C Register usage
C	r2			caller value of ar.lc, put back before returning
C	r9  (tnc)		64 - cnt
C	r10, r16-r19		limbs loaded from up
C	r20, r22, r24, r26	FSH results, limb << cnt
C	r21, r23, r25, r27	BSH results, limb >> tnc
C	r14, r15		FSH and BSH halves merged into a result limb
C	r31			complemented limb waiting for its st8
C	r11			lfetch pointer, starts POFF bytes from the top
C				limb and moves by PUPD per main loop trip
C	p6, p7, p8		n mod 4 equals 1, 2, 3
C
C The trailing M, I, B notes on some instructions are the original author
C annotations, presumably the execution units the instruction can issue on.
C ---------------------------------------------------------------------------

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`cnt',`r35')

define(`tnc',`r9')

C FSH shifts a limb into its own result position, BSH extracts the bits that
C the next lower limb contributes to the same result limb.
define(`FSH',`shl')
define(`BSH',`shr.u')
define(`UPD',`-8')
define(`POFF',`-512')
define(`PUPD',`-32')
define(`func',`mpn_lshiftc')

ASM_START()
PROLOGUE(mpn_lshiftc)
	.prologue
	.save	ar.lc, r2
	.body
C With the 32-bit ABI the incoming pointers and integers are widened first.
ifdef(`HAVE_ABI_32',
`	addp4	rp = 0, rp		C			M I
	addp4	up = 0, up		C			M I
	sxt4	n = n			C			M I
	zxt4	cnt = cnt		C			I
	;;
')

C Setup: r14 = n mod 4 selects the feed-in path and r15 = n-1.  up and rp are
C moved to their last limb, ar.lc receives (n-1)/4, r11 becomes the lfetch
C pointer and the top limb is loaded into r10.
 {.mmi;	nop	0			C			M I
	and	r14 = 3, n		C			M I
	mov.i	r2 = ar.lc		C			I0
}{.mmi;	add	r15 = -1, n		C			M I
	sub	tnc = 64, cnt		C			M I
	nop	0
	;;
}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
	cmp.eq	p7, p0 = 2, r14		C			M I
	shr.u	n = r15, 2		C			I0
}{.mmi;	cmp.eq	p8, p0 = 3, r14		C			M I
	shladd	up = r15, 3, up		C			M I
	shladd	rp = r15, 3, rp		C			M I
	;;
}{.mmi;	add	r11 = POFF, up		C			M I
	ld8	r10 = [up], UPD		C			M01
	mov.i	ar.lc = n		C			I0
}{.bbb;
   (p6)	br.dptk	.Lb01
   (p7)	br.dptk	.Lb10
   (p8)	br.dptk	.Lb11
	;; }

C Feed-in for n mod 4 = 0.  The first br.cloop falls through only when ar.lc
C is zero, that is n = 4; those limbs are finished from .Lr4 onwards.
.Lb00:
	ld8	r19 = [up], UPD
	;;
	ld8	r16 = [up], UPD
	;;
	ld8	r17 = [up], UPD
	BSH	r8 = r10, tnc		C bits shifted out of the top limb
	br.cloop.dptk	L(gt4)
	;;
	FSH	r24 = r10, cnt
	BSH	r25 = r19, tnc
	;;
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
	;;
	or	r15 = r27, r26
	sub	r31 = -1, r14		C -1 - x is the complement of x
	br	.Lr4

L(gt4):
 {.mmi;	nop	0
	nop	0
	FSH	r24 = r10, cnt
}{.mmi;	ld8	r18 = [up], UPD
	nop	0
	BSH	r25 = r19, tnc
	;; }
 {.mmi;	nop	0
	nop	0
	FSH	r26 = r19, cnt
}{.mmi;	ld8	r19 = [up], UPD
	nop	0
	BSH	r27 = r16, tnc
	;; }
 {.mmi;	nop	0
	nop	0
	FSH	r20 = r16, cnt
}{.mmi;	ld8	r16 = [up], UPD
	nop	0
	BSH	r21 = r17, tnc
	;; }
 {.mmi;	nop	0
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
}{.mib;	ld8	r17 = [up], UPD
	BSH	r23 = r18, tnc
	br.cloop.dptk	L(gt8)
	;; }
 {.mmi;	nop	0
	or	r15 = r27, r26
	FSH	r24 = r18, cnt
}{.mib;	sub	r31 = -1, r14
	BSH	r25 = r19, tnc
	br	.Lr8 }

L(gt8):
	or	r15 = r27, r26
	FSH	r24 = r18, cnt
	ld8	r18 = [up], UPD
	sub	r31 = -1, r14
	BSH	r25 = r19, tnc
	br	.LL00

C Feed-in for n mod 4 = 1.  Falling through the first br.cloop means n = 1.
.Lb01:
	br.cloop.dptk	L(gt1)
	;;
	BSH	r8 = r10, tnc		C bits shifted out of the top limb
	FSH	r22 = r10, cnt
	;;
	sub	r31 = -1, r22
	br	.Lr1
	;;
L(gt1):
	ld8	r18 = [up], UPD
	BSH	r8 = r10, tnc		C bits shifted out of the top limb
	FSH	r22 = r10, cnt
	;;
	ld8	r19 = [up], UPD
	;;
	ld8	r16 = [up], UPD
	;;
	ld8	r17 = [up], UPD
	BSH	r23 = r18, tnc
	br.cloop.dptk	L(gt5)
	;;
	nop	0
	FSH	r24 = r18, cnt
	BSH	r25 = r19, tnc
	;;
	nop	0
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
	sub	r31 = -1, r15
	br	.Lr5

L(gt5):
 {.mmi;	nop	0
	nop	0
	FSH	r24 = r18, cnt
}{.mmi;	ld8	r18 = [up], UPD
	nop	0
	BSH	r25 = r19, tnc
	;; }
 {.mmi;	nop	0
	nop	0
	FSH	r26 = r19, cnt
}{.mmi;	ld8	r19 = [up], UPD
	nop	0
	BSH	r27 = r16, tnc
	;; }
 {.mmi;	nop	0
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
}{.mmi;	ld8	r16 = [up], UPD
	nop	0
	BSH	r21 = r17, tnc
	;; }
 {.mmi;	or	r14 = r25, r24
	sub	r31 = -1, r15
	FSH	r22 = r17, cnt
}{.mib;	ld8	r17 = [up], UPD
	BSH	r23 = r18, tnc
	br	L(end)
	;; }

C Feed-in for n mod 4 = 2.  Falling through the first br.cloop means n = 2.
.Lb10:
	ld8	r17 = [up], UPD
	br.cloop.dptk	L(gt2)
	;;
	BSH	r8 = r10, tnc		C bits shifted out of the top limb
	FSH	r20 = r10, cnt
	;;
	BSH	r21 = r17, tnc
	FSH	r22 = r17, cnt
	;;
	or	r14 = r21, r20
	;;
	sub	r31 = -1, r14
	br	.Lr2
	;;
L(gt2):
	ld8	r18 = [up], UPD
	BSH	r8 = r10, tnc		C bits shifted out of the top limb
	FSH	r20 = r10, cnt
	;;
	ld8	r19 = [up], UPD
	;;
	ld8	r16 = [up], UPD
	BSH	r21 = r17, tnc
	FSH	r22 = r17, cnt
	;;
	ld8	r17 = [up], UPD
	BSH	r23 = r18, tnc
	br.cloop.dptk	L(gt6)
	;;
	nop	0
	FSH	r24 = r18, cnt
	BSH	r25 = r19, tnc
	;;
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
	BSH	r27 = r16, tnc
	;;
 {.mmi;	nop	0
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
}{.mib;	sub	r31 = -1, r14
	BSH	r21 = r17, tnc
	br	.Lr6
	;; }
L(gt6):
 {.mmi;	nop	0
	nop	0
	FSH	r24 = r18, cnt
}{.mmi;	ld8	r18 = [up], UPD
	nop	0
	BSH	r25 = r19, tnc
	;; }
 {.mmi;	nop	0
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
}{.mmi;	ld8	r19 = [up], UPD
	nop	0
	BSH	r27 = r16, tnc
	;; }
 {.mmi;	or	r15 = r23, r22
	sub	r31 = -1, r14
	FSH	r20 = r16, cnt
}{.mib;	ld8	r16 = [up], UPD
	BSH	r21 = r17, tnc
	br	.LL10
}

C Feed-in for n mod 4 = 3.  Falling through the first br.cloop means n = 3.
.Lb11:
	ld8	r16 = [up], UPD
	;;
	ld8	r17 = [up], UPD
	BSH	r8 = r10, tnc		C bits shifted out of the top limb
	FSH	r26 = r10, cnt
	br.cloop.dptk	L(gt3)
	;;
	BSH	r27 = r16, tnc
	;;
	FSH	r20 = r16, cnt
	BSH	r21 = r17, tnc
	;;
	FSH	r22 = r17, cnt
	;;
	or	r15 = r27, r26
	;;
	or	r14 = r21, r20
	sub	r31 = -1, r15
	br	.Lr3
	;;
L(gt3):
	ld8	r18 = [up], UPD
	;;
	ld8	r19 = [up], UPD
	BSH	r27 = r16, tnc
	;;
 {.mmi;	nop	0
	nop	0
	FSH	r20 = r16, cnt
}{.mmi;	ld8	r16 = [up], UPD
	nop	0
	BSH	r21 = r17, tnc
	;; }
 {.mmi;	nop	0
	nop	0
	FSH	r22 = r17, cnt
}{.mib;	ld8	r17 = [up], UPD
	BSH	r23 = r18, tnc
	br.cloop.dptk	L(gt7)
	;; }
	or	r15 = r27, r26
	FSH	r24 = r18, cnt
	BSH	r25 = r19, tnc
	;;
 {.mmi;	nop	0
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
}{.mib;	sub	r31 = -1, r15
	BSH	r27 = r16, tnc
	br	.Lr7
}
L(gt7):
 {.mmi;	nop	0
	or	r15 = r27, r26
	FSH	r24 = r18, cnt
}{.mmi;	ld8	r18 = [up], UPD
	nop	0
	BSH	r25 = r19, tnc
	;; }
 {.mmi;	or	r14 = r21, r20
	sub	r31 = -1, r15
	FSH	r26 = r19, cnt
}{.mib;	ld8	r19 = [up], UPD
	BSH	r27 = r16, tnc
	br	.LL11
}

C *** MAIN LOOP START ***
C Each trip stores four result limbs and loads four source limbs.  .LL00,
C .LL11, .LL10 and L(end) are where the feed-in paths above join the loop.
	ALIGN(32)
L(top):
.LL01:
 {.mmi;	st8	[rp] = r31, UPD		C M2
	or	r15 = r27, r26		C M3
	FSH	r24 = r18, cnt		C I0
}{.mmi;	ld8	r18 = [up], UPD		C M0
	sub	r31 = -1, r14		C M1
	BSH	r25 = r19, tnc		C I1
	;; }
.LL00:
 {.mmi;	st8	[rp] = r31, UPD
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
}{.mmi;	ld8	r19 = [up], UPD
	sub	r31 = -1, r15
	BSH	r27 = r16, tnc
	;; }
.LL11:
 {.mmi;	st8	[rp] = r31, UPD
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
}{.mmi;	ld8	r16 = [up], UPD
	sub	r31 = -1, r14
	BSH	r21 = r17, tnc
	;; }
.LL10:
 {.mmi;	st8	[rp] = r31, UPD
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
}{.mmi;	ld8	r17 = [up], UPD
	sub	r31 = -1, r15
	BSH	r23 = r18, tnc
	;; }
L(end):	lfetch	[r11], PUPD
	br.cloop.dptk	L(top)
C *** MAIN LOOP END ***

C Wind-down: no more loads, the limbs already in registers are merged,
C complemented and stored.  Each .LrK label is entered with K limbs still to
C be stored.
 {.mmi;	st8	[rp] = r31, UPD
	or	r15 = r27, r26
	FSH	r24 = r18, cnt
}{.mib;	sub	r31 = -1, r14
	BSH	r25 = r19, tnc
	nop	0
	;; }
.Lr8:
 {.mmi;	st8	[rp] = r31, UPD
	or	r14 = r21, r20
	FSH	r26 = r19, cnt
}{.mib;	sub	r31 = -1, r15
	BSH	r27 = r16, tnc
	nop	0
	;; }
.Lr7:
 {.mmi;	st8	[rp] = r31, UPD
	or	r15 = r23, r22
	FSH	r20 = r16, cnt
}{.mib;	sub	r31 = -1, r14
	BSH	r21 = r17, tnc
	nop	0
	;; }
.Lr6:	st8	[rp] = r31, UPD
	or	r14 = r25, r24
	FSH	r22 = r17, cnt
	sub	r31 = -1, r15
	;;
.Lr5:	st8	[rp] = r31, UPD
	or	r15 = r27, r26
	sub	r31 = -1, r14
	;;
.Lr4:	st8	[rp] = r31, UPD
	or	r14 = r21, r20
	sub	r31 = -1, r15
	;;
.Lr3:	st8	[rp] = r31, UPD
	sub	r31 = -1, r14
	;;
C The lowest limb has no lower neighbour, so only its FSH part (r22) is
C complemented.
.Lr2:	st8	[rp] = r31, UPD
	sub	r31 = -1, r22
	;;
.Lr1:	st8	[rp] = r31, UPD		C			M23
	mov	ar.lc = r2		C			I0
	br.ret.sptk.many b0		C			B
EPILOGUE(func)
ASM_END()