1dnl IA-64 mpn_bdiv_dbm1. 2 3dnl Contributed to the GNU project by Torbjorn Granlund. 4 5dnl Copyright 2008, 2009 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of the GNU Lesser General Public License as published 11dnl by the Free Software Foundation; either version 3 of the License, or (at 12dnl your option) any later version. 13 14dnl The GNU MP Library is distributed in the hope that it will be useful, but 15dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 16dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 17dnl License for more details. 18 19dnl You should have received a copy of the GNU Lesser General Public License 20dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 21 22include(`../config.m4') 23 24C cycles/limb 25C Itanium: 4 26C Itanium 2: 2 27 28C TODO 29C * Optimize feed-in and wind-down code, both for speed and code size. 
C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n',  `r34')
define(`bd', `r35')

C How the arguments are used by the code below:
C   rp (r32)  destination pointer; every st8 goes through it, post-incrementing
C             by 8 (the last store does not post-increment).
C   up (r33)  source pointer; every ldf8 goes through it, post-incrementing by 8.
C   n  (r34)  limb count.  n mod 4 selects the feed-in path and (n-1) >> 2 is
C             loaded into ar.lc as the loop count.
C   bd (r35)  multiplier, moved to f6 and used as one factor of every xma.
C   r36       fifth argument (it has no define above); copied to r15 as the
C             initial value of the running word called h in these comments.
C
C Per source limb u, in order, the code computes
C       lo = low 64 bits of u * bd              (xma.l,  addend f0)
C       hi = high 64 bits of u * bd             (xma.hu, addend f0)
C       borrow = (h < lo)                       (cmp.ltu p6, p7)
C       h = h - lo;  store h through rp
C       h = h - hi - borrow                     ((p6) sub ..., 1 / (p7) sub)
C and the final h is returned in r8.
C
C Register roles, as established by the instructions below:
C   f9-f13    source limbs as loaded by ldf8
C   f32-f39   products; (f32,f33) (f34,f35) (f36,f37) (f38,f39) are lo/hi pairs
C   r20-r27   integer copies of f32-f39, in the same order (getf.sig)
C   r16-r19   h - lo values, i.e. the limbs that get stored
C   r15       running h
C   r14       n mod 4
C   r31       (n-1) >> 2
C   r2        caller value of ar.lc, restored before returning
C
C NOTE(review): the first limb is loaded before n is examined and the loop
C count is formed as (n-1) >> 2 with an unsigned shift, so n >= 1 appears to
C be assumed - confirm against callers.

ASM_START()
PROLOGUE(mpn_bdiv_dbm1c)
        .prologue
        .save           ar.lc, r2
        .body

C Only when HAVE_ABI_32 is defined: pass both pointers through addp4 and
C zero-extend n from 32 bits.
ifdef(`HAVE_ABI_32',
`       addp4           rp = 0, rp              C M I
        addp4           up = 0, up              C M I
        zxt4            n = n                   C I
        ;;
')
C h = r36, and start loading the first limb.
{.mmb
        mov             r15 = r36               C M I
        ldf8            f9 = [up], 8            C M
        nop.b           0                       C B
}
C .Lcommon is not referenced anywhere else in the visible code.
.Lcommon:
C r16 = n - 1, r2 = saved ar.lc, r14 = n mod 4.
{.mii
        adds            r16 = -1, n             C M I
        mov             r2 = ar.lc              C I0
        and             r14 = 3, n              C M I
        ;;
}
C f6 = bd, r31 = (n-1) >> 2, and p10/p11/p12 = (n mod 4 is 0 / 2 / 3).
{.mii
        setf.sig        f6 = bd                 C M2 M3
        shr.u           r31 = r16, 2            C I0
        cmp.eq          p10, p0 = 0, r14        C M I
}
{.mii
        nop.m           0                       C M
        cmp.eq          p11, p0 = 2, r14        C M I
        cmp.eq          p12, p0 = 3, r14        C M I
        ;;
}
C r0 != r0 is never true, so these compares clear p6 and p8 and set p7 and p9.
C p8 and p9 are not read anywhere else in this function.
{.mii
        cmp.ne          p6, p7 = r0, r0         C M I
        mov.i           ar.lc = r31             C I0
        cmp.ne          p8, p9 = r0, r0         C M I
}
C Dispatch on n mod 4: 0 -> .Lb00, 2 -> .Lb10, 3 -> .Lb11, and 1 falls
C through to .Lb01.
{.bbb
  (p10) br.dptk         .Lb00                   C B
  (p11) br.dptk         .Lb10                   C B
  (p12) br.dptk         .Lb11                   C B
        ;;
}

C Feed-in code.  Each br.cloop below is taken, decrementing ar.lc, while ar.lc
C is non-zero.  In every .LbXX path the first br.cloop falls through when
C ar.lc is zero, and the products are then formed without further loads
C before jumping into the wind-down code.  The .grtN paths load more limbs;
C their second br.cloop decides between entering the main loop at an .LLxx
C label and jumping to a later wind-down label.

C n mod 4 = 1.  Fall-through handles a single limb (f9) and joins at .Lcj1.
.Lb01:  br.cloop.dptk   .grt1
        ;;
        xma.l           f38 = f9, f6, f0
        xma.hu          f39 = f9, f6, f0
        ;;
        getf.sig        r26 = f38
        getf.sig        r27 = f39
        br              .Lcj1

C Four more limbs are loaded; fall-through joins the wind-down at .Lcj5.
.grt1:  ldf8            f10 = [r33], 8
        ;;
        ldf8            f11 = [r33], 8
        ;;
        ldf8            f12 = [r33], 8
        ;;
        xma.l           f38 = f9, f6, f0
        xma.hu          f39 = f9, f6, f0
        ;;
        ldf8            f13 = [r33], 8
        ;;
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .grt5

        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        br              .Lcj5

C Keeps loading while priming the pipeline, then enters the loop at .LL01.
.grt5:  ldf8            f10 = [r33], 8
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ldf8            f11 = [r33], 8
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ldf8            f12 = [r33], 8
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        br              .LL01

C n mod 4 = 2.  Fall-through handles two limbs (f9, f13) and joins at .Lcj2.
.Lb10:  ldf8            f13 = [r33], 8
        br.cloop.dptk   .grt2
        ;;

        xma.l           f36 = f9, f6, f0
        xma.hu          f37 = f9, f6, f0
        ;;
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r24 = f36
        ;;
        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        ;;
        getf.sig        r27 = f39
        br              .Lcj2

C Four more limbs are loaded; fall-through joins the wind-down at .Lcj6.
.grt2:  ldf8            f10 = [r33], 8
        ;;
        ldf8            f11 = [r33], 8
        ;;
        xma.l           f36 = f9, f6, f0
        xma.hu          f37 = f9, f6, f0
        ;;
        ldf8            f12 = [r33], 8
        ;;
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        ldf8            f13 = [r33], 8
        ;;
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .grt6

        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        br              .Lcj6

C Keeps loading while priming the pipeline, then enters the loop at .LL10.
.grt6:  getf.sig        r25 = f37
        ldf8            f10 = [r33], 8
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r27 = f39
        ldf8            f11 = [r33], 8
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        br              .LL10


C n mod 4 = 3.  Fall-through handles three limbs (f9, f12, f13) and joins at
C .Lcj3.
.Lb11:  ldf8            f12 = [r33], 8
        ;;
        ldf8            f13 = [r33], 8
        br.cloop.dptk   .grt3
        ;;

        xma.l           f34 = f9, f6, f0
        xma.hu          f35 = f9, f6, f0
        ;;
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ;;
        getf.sig        r24 = f36
        ;;
        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        br              .Lcj3

C Four more limbs are loaded; fall-through joins the wind-down at .Lcj7.
.grt3:  ldf8            f10 = [r33], 8
        ;;
        xma.l           f34 = f9, f6, f0
        xma.hu          f35 = f9, f6, f0
        ;;
        ldf8            f11 = [r33], 8
        ;;
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        ldf8            f12 = [r33], 8
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ldf8            f13 = [r33], 8
        ;;
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .grt7

        getf.sig        r25 = f37
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        br              .Lcj7

C Keeps loading while priming the pipeline, then enters the loop at .LL11.
.grt7:  getf.sig        r25 = f37
        ldf8            f10 = [r33], 8
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        br              .LL11


C n mod 4 = 0.  Fall-through handles four limbs (f9, f11, f12, f13) and joins
C at .Lcj4.
.Lb00:  ldf8            f11 = [r33], 8
        ;;
        ldf8            f12 = [r33], 8
        ;;
        ldf8            f13 = [r33], 8
        br.cloop.dptk   .grt4
        ;;

        xma.l           f32 = f9, f6, f0
        xma.hu          f33 = f9, f6, f0
        ;;
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ;;
        getf.sig        r24 = f36
        br              .Lcj4

C Four more limbs are loaded.  The closing br.cloop enters the loop at .LL00;
C when it falls through, the next branch joins the wind-down at .Lcj8.
.grt4:  xma.l           f32 = f9, f6, f0
        xma.hu          f33 = f9, f6, f0
        ;;
        ldf8            f10 = [r33], 8
        ;;
        xma.l           f34 = f11, f6, f0
        xma.hu          f35 = f11, f6, f0
        ;;
        ldf8            f11 = [r33], 8
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
        xma.hu          f37 = f12, f6, f0
        ;;
        getf.sig        r21 = f33
        ldf8            f12 = [r33], 8
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
        xma.hu          f39 = f13, f6, f0
        ;;
        getf.sig        r23 = f35
        ldf8            f13 = [r33], 8
        ;;
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
        xma.hu          f33 = f10, f6, f0
        br.cloop.dptk   .LL00
        br              .Lcj8

C One pass from .Ltop to the closing br.cloop stores four limbs and loads four
C more.  The pass consists of four stages that differ only in which registers
C they use; .LL00, .LL11, .LL10 and .LL01 are the mid-stage entry points used
C by the feed-in code above.  Within a stage:
C   cmp.ltu p6, p7 = h, lo      p6 = borrow out of h - lo, p7 = no borrow
C   sub rX = h, lo              limb to be stored
C   (p6) sub h = rX, hi, 1      h = rX - hi - 1
C   (p7) sub h = rX, hi         h = rX - hi
C while the getf.sig, xma and ldf8 instructions prepare data for later stages.
C *** MAIN LOOP START ***
        ALIGN(32)
.Ltop:
        .pred.rel "mutex",p6,p7
C       .mfi
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
  (p6)  sub             r15 = r19, r27, 1
C       .mfi
        st8             [r32] = r19, 8
        xma.hu          f33 = f10, f6, f0
  (p7)  sub             r15 = r19, r27
        ;;
.LL00:
C       .mfi
        getf.sig        r25 = f37
        nop.f           0
        cmp.ltu         p6, p7 = r15, r20
C       .mib
        ldf8            f10 = [r33], 8
        sub             r16 = r15, r20
        nop.b           0
        ;;

C       .mfi
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
  (p6)  sub             r15 = r16, r21, 1
C       .mfi
        st8             [r32] = r16, 8
        xma.hu          f35 = f11, f6, f0
  (p7)  sub             r15 = r16, r21
        ;;
.LL11:
C       .mfi
        getf.sig        r27 = f39
        nop.f           0
        cmp.ltu         p6, p7 = r15, r22
C       .mib
        ldf8            f11 = [r33], 8
        sub             r17 = r15, r22
        nop.b           0
        ;;

C       .mfi
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
  (p6)  sub             r15 = r17, r23, 1
C       .mfi
        st8             [r32] = r17, 8
        xma.hu          f37 = f12, f6, f0
  (p7)  sub             r15 = r17, r23
        ;;
.LL10:
C       .mfi
        getf.sig        r21 = f33
        nop.f           0
        cmp.ltu         p6, p7 = r15, r24
C       .mib
        ldf8            f12 = [r33], 8
        sub             r18 = r15, r24
        nop.b           0
        ;;

C       .mfi
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
  (p6)  sub             r15 = r18, r25, 1
C       .mfi
        st8             [r32] = r18, 8
        xma.hu          f39 = f13, f6, f0
  (p7)  sub             r15 = r18, r25
        ;;
.LL01:
C       .mfi
        getf.sig        r23 = f35
        nop.f           0
        cmp.ltu         p6, p7 = r15, r26
C       .mib
        ldf8            f13 = [r33], 8
        sub             r19 = r15, r26
        br.cloop.sptk.few .Ltop
C *** MAIN LOOP END ***
        ;;

C Wind-down code.  The same stages as in the loop, now without loads, and
C with the xma and getf.sig instructions dropping out as the pipeline drains.
C Entering at .LcjK leaves exactly K limbs still to be stored; falling out of
C the loop reaches the block just below, which leaves nine.
        getf.sig        r24 = f36
        xma.l           f32 = f10, f6, f0
  (p6)  sub             r15 = r19, r27, 1
        st8             [r32] = r19, 8
        xma.hu          f33 = f10, f6, f0
  (p7)  sub             r15 = r19, r27
        ;;
.Lcj8:  getf.sig        r25 = f37
        cmp.ltu         p6, p7 = r15, r20
        sub             r16 = r15, r20
        ;;
        getf.sig        r26 = f38
        xma.l           f34 = f11, f6, f0
  (p6)  sub             r15 = r16, r21, 1
        st8             [r32] = r16, 8
        xma.hu          f35 = f11, f6, f0
  (p7)  sub             r15 = r16, r21
        ;;
.Lcj7:  getf.sig        r27 = f39
        cmp.ltu         p6, p7 = r15, r22
        sub             r17 = r15, r22
        ;;
        getf.sig        r20 = f32
        xma.l           f36 = f12, f6, f0
  (p6)  sub             r15 = r17, r23, 1
        st8             [r32] = r17, 8
        xma.hu          f37 = f12, f6, f0
  (p7)  sub             r15 = r17, r23
        ;;
.Lcj6:  getf.sig        r21 = f33
        cmp.ltu         p6, p7 = r15, r24
        sub             r18 = r15, r24
        ;;
        getf.sig        r22 = f34
        xma.l           f38 = f13, f6, f0
  (p6)  sub             r15 = r18, r25, 1
        st8             [r32] = r18, 8
        xma.hu          f39 = f13, f6, f0
  (p7)  sub             r15 = r18, r25
        ;;
.Lcj5:  getf.sig        r23 = f35
        cmp.ltu         p6, p7 = r15, r26
        sub             r19 = r15, r26
        ;;
        getf.sig        r24 = f36
  (p6)  sub             r15 = r19, r27, 1
        st8             [r32] = r19, 8
  (p7)  sub             r15 = r19, r27
        ;;
.Lcj4:  getf.sig        r25 = f37
        cmp.ltu         p6, p7 = r15, r20
        sub             r16 = r15, r20
        ;;
        getf.sig        r26 = f38
  (p6)  sub             r15 = r16, r21, 1
        st8             [r32] = r16, 8
  (p7)  sub             r15 = r16, r21
        ;;
.Lcj3:  getf.sig        r27 = f39
        cmp.ltu         p6, p7 = r15, r22
        sub             r17 = r15, r22
        ;;
  (p6)  sub             r15 = r17, r23, 1
        st8             [r32] = r17, 8
  (p7)  sub             r15 = r17, r23
        ;;
.Lcj2:  cmp.ltu         p6, p7 = r15, r24
        sub             r18 = r15, r24
        ;;
  (p6)  sub             r15 = r18, r25, 1
        st8             [r32] = r18, 8
  (p7)  sub             r15 = r18, r25
        ;;
C Last limb: the final h goes to r8 instead of r15, the store does not
C post-increment, and ar.lc gets back the value saved in r2 on entry.
.Lcj1:  cmp.ltu         p6, p7 = r15, r26
        sub             r19 = r15, r26
        ;;
  (p6)  sub             r8 = r19, r27, 1
        st8             [r32] = r19
  (p7)  sub             r8 = r19, r27
        mov             ar.lc = r2
        br.ret.sptk.many b0
EPILOGUE()
ASM_END()