/* Copyright (C) 2006-2019 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _VMX2SPU_H_
#define _VMX2SPU_H_ 1

#ifdef __cplusplus

#ifdef __SPU__

#include <spu_intrinsics.h>
#include <vec_types.h>

/* This file maps generic VMX intrinsics and predicates to the SPU using
 * overloaded C++ functions.
 */
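/* Illustrative usage sketch (an added note, not part of the original header):
 * VMX-style source such as the fragment below compiles for the SPU because
 * each vec_* call resolves to one of the overloads or macros defined here
 * (e.g. vec_madd and vec_add map directly onto spu_madd and spu_add).  The
 * function name and constants are hypothetical.
 *
 *   static inline vec_float4 scale_and_bias(vec_float4 v)
 *   {
 *     vec_float4 scale = spu_splats(2.0f);
 *     vec_float4 bias  = spu_splats(1.0f);
 *     return vec_madd(v, scale, bias);   // becomes spu_madd(v, scale, bias)
 *   }
 */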
/************************************************************************
 *                            INTRINSICS
 ************************************************************************/

/* vec_abs (vector absolute value)
 * =======
 */
static inline vec_char16 vec_abs(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abs(vec_short8 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abs(vec_int4 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_float4 vec_abs(vec_float4 a)
{
  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
}

/* vec_abss (vector absolute value saturate)
 * ========
 */
static inline vec_char16 vec_abss(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
                                (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abss(vec_short8 a)
{
  vec_short8 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abss(vec_int4 a)
{
  vec_int4 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}


/* vec_add (vector add)
 * =======
 */
static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
                                spu_splats((unsigned short)(0xFF00)))));
}

static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
{
  return (spu_add((vec_short8)(a), b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
{
  return (spu_add(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
{
  return (spu_add((vec_int4)(a), b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
{
  return (spu_add(a, (vec_int4)(b)));
}

static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
{
  return (spu_add(a, b));
}

/* vec_addc (vector add carryout unsigned word)
 * ========
 */
#define vec_addc(_a, _b)	spu_genc(_a, _b)

/* vec_adds (vector add saturated)
 * ========
 */
static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
                                          8, 24, 10, 26, 12, 28, 14, 30}));
  d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  return (spu_or(d, spu_cmpeq(s, 1)));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
  return ((vec_char16)(d));
}

static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
{
  return (vec_adds((vec_char16)(a), b));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
{
  return (vec_adds(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 s, d;

  s = spu_add(a, b);
  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
  return (d);
}

static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
{
  vec_short8 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
  return (d);
}
static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
{
  return (vec_adds((vec_short8)(a), b));
}

static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
{
  return (vec_adds(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
{
  vec_int4 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
  return (d);
}

static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
{
  return (vec_adds((vec_int4)(a), b));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
{
  return (vec_adds(a, (vec_int4)(b)));
}
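/* Behavioral sketch (an added note, not from the original header): the
 * vec_adds overloads above clamp to the type's limits instead of wrapping,
 * so, assuming the values below,
 *
 *   vec_short8 x = spu_splats((signed short)0x7FFE);
 *   vec_short8 y = spu_splats((signed short)5);
 *   vec_short8 r = vec_adds(x, y);   // every element saturates to 0x7FFF
 *
 * whereas vec_add(x, y) would wrap around to a negative value.
 */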
/* vec_and (vector logical and)
 * =======
 */
static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
{
  return (spu_and((vec_char16)(a), b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
{
  return (spu_and(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
{
  return (spu_and((vec_short8)(a), b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
{
  return (spu_and(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
{
  return (spu_and((vec_int4)(a), b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_int4)(b)));
}

static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
{
  return (spu_and(a, b));
}

static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
{
  return (spu_and((vec_float4)(a), b));
}

static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_float4)(b)));
}


/* vec_andc (vector logical and with complement)
 * ========
 */
static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
{
  return (spu_andc((vec_char16)(a), b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
{
  return (spu_andc(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
{
  return (spu_andc((vec_short8)(a), b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
{
  return (spu_andc(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
{
  return (spu_andc((vec_int4)(a), b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_int4)(b)));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
{
  return (spu_andc(a, b));
}

static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
{
  return (spu_andc((vec_float4)(a), b));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_float4)(b)));
}

/* vec_avg (vector average)
 * =======
 */
static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_avg(a, b));
}

static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
                               (vec_uchar16)(spu_and(spu_xor(a, b), 0x80)))));
}

static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}


/* vec_ceil (vector ceiling)
 * ========
 */
static inline vec_float4 vec_ceil(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_cmpb (vector compare bounds floating-point)
 * ========
 */
static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
{
  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);

  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
                 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
}

/* vec_cmpeq (vector compare equal)
 * =========
 */
#define vec_cmpeq(_a, _b)	spu_cmpeq(_a, _b)


/* vec_cmpge (vector compare greater than or equal)
 * =========
 */
static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(b, a), -1));
}

/* vec_cmpgt (vector compare greater than)
 * =========
 */
#define vec_cmpgt(_a, _b)	spu_cmpgt(_a, _b)


/* vec_cmple (vector compare less than or equal)
 * =========
 */
static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(a, b), -1));
}


/* vec_cmplt (vector compare less than)
 * =========
 */
#define vec_cmplt(_a, _b)	spu_cmpgt(_b, _a)


/* vec_ctf (vector convert from fixed-point word)
 * =======
 */
#define vec_ctf(_a, _b)		spu_convtf(_a, _b)


/* vec_cts (vector convert to signed fixed-point word saturate)
 * =======
 */
#define vec_cts(_a, _b)		spu_convts(_a, _b)


/* vec_ctu (vector convert to unsigned fixed-point word saturate)
 * =======
 */
#define vec_ctu(_a, _b)		spu_convtu(_a, _b)


/* vec_dss (vector data stream stop)
 * =======
 */
#define vec_dss(_a)


/* vec_dssall (vector data stream stop all)
 * ==========
 */
#define vec_dssall()


/* vec_dst (vector data stream touch)
 * =======
 */
#define vec_dst(_a, _b, _c)


/* vec_dstst (vector data stream touch for store)
 * =========
 */
#define vec_dstst(_a, _b, _c)


/* vec_dststt (vector data stream touch for store transient)
 * ==========
 */
#define vec_dststt(_a, _b, _c)


/* vec_dstt (vector data stream touch transient)
 * ========
 */
#define vec_dstt(_a, _b, _c)


/* vec_expte (vector 2 raised to the exponent estimate floating-point)
 * =========
 */
static inline vec_float4 vec_expte(vec_float4 a)
{
  vec_float4 bias, frac, exp;
  vec_int4 ia;

  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
  ia   = spu_convts(spu_add(a, bias), 0);
  frac = spu_sub(spu_convtf(ia, 0), a);
  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));

  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
                           frac, spu_splats(1.0f)), exp));
}


/* vec_floor (vector floor)
 * =========
 */
static inline vec_float4 vec_floor(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_ld (vector load indexed)
 * ======
 */
static inline vec_uchar16 vec_ld(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
{
  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
}

static inline vec_char16 vec_ld(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_char16 vec_ld(int a, vec_char16 *b)
{
  return (*((vec_char16 *)((signed char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}
static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, vec_short8 *b)
{
  return (*((vec_short8 *)((signed char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, vec_int4 *b)
{
  return (*((vec_int4 *)((signed char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, vec_float4 *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_lde (vector load element indexed)
 * =======
 */
static inline vec_uchar16 vec_lde(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_char16 vec_lde(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_ushort8 vec_lde(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_lde(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}


static inline vec_uint4 vec_lde(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_lde(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}


static inline vec_float4 vec_lde(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_ldl (vector load indexed LRU)
 * =======
 */
#define vec_ldl(_a, _b)		vec_ld(_a, _b)


/* vec_loge (vector log2 estimate floating-point)
 * ========
 */
static inline vec_float4 vec_loge(vec_float4 a)
{
  vec_int4   exp;
  vec_float4 frac;

  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));

  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
                   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
}


/* vec_lvsl (vector load for shift left)
 * ========
 */
static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
{
  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
                               ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
                                              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
}

static inline vec_uchar16 vec_lvsl(int a, signed char *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}
static inline vec_uchar16 vec_lvsl(int a, int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, float *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}


/* vec_lvsr (vector load for shift right)
 * ========
 */
static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
{
  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
                                               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
                                (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
}

static inline vec_uchar16 vec_lvsr(int a, signed char *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, float *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

/* vec_madd (vector multiply add)
 * ========
 */
#define vec_madd(_a, _b, _c)	spu_madd(_a, _b, _c)



/* vec_madds (vector multiply add saturate)
 * =========
 */
static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
                              (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
                              ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
}

/* vec_max (vector maximum)
 * =======
 */
static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
{
  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
{
  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

/* vec_mergeh (vector merge high)
 * ==========
 */
static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

/* vec_mergel (vector merge low)
 * ==========
 */
static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

/* vec_mfvscr (vector move from vector status and control register)
 * ==========
 */
static inline vec_ushort8 vec_mfvscr()
{
  return ((vec_ushort8)spu_splats(0));		/* not supported */
}


/* vec_min (vector minimum)
 * =======
 */
static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}
static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
{
  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
{
  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

/* vec_mladd (vector multiply low and add unsigned half word)
 * =========
 */
static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
                                            (vec_short8)(spu_rl((vec_uint4)(b), -16)),
                                            (vec_int4)(spu_rl((vec_uint4)(c), -16))),
                                   spu_madd(a, b, spu_extend(c)),
                                   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                                  10, 11, 26, 27, 14, 15, 30, 31}))));
}


static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
{
  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
}

static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
{
  return (vec_mladd((vec_short8)(a), b, c));
}

static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
{
  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
}


/* vec_mradds (vector multiply round and add saturate)
 * ==========
 */
static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  vec_int4 round = (vec_int4)spu_splats(0x4000);
  vec_short8 hi, lo;

  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));

  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
}

/* vec_msum (vector multiply sum)
 * ========
 */
static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
{
  vec_ushort8 a1, a2, b1, b2;
  vec_uint4 p1, p2;

  a1 = spu_and((vec_ushort8)(a), 0xFF);
  a2 = spu_rlmask((vec_ushort8)(a), -8);
  b1 = spu_and((vec_ushort8)(b), 0xFF);
  b2 = spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
{
  vec_short8 a1, a2, b1, b2;
  vec_int4 p1, p2;

  a1 = (vec_short8)(spu_extend(a));
  a2 = spu_rlmaska((vec_short8)(a), -8);
  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}


/* vec_msums (vector multiply sum saturate)
 * ========
 */
static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  vec_uint4 p1, p2;

  p1 = spu_mulo(a, b);
  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));

  return (vec_adds(p2, vec_adds(p1, c)));
}

static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_mtvscr (vector move to vector status and control register)
 * ==========
 */
#define vec_mtvscr(_a)		/* not supported */


/* vec_mule (vector multiply even)
 * ========
 */
static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
                             (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
                             (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
{
  vec_short8 hi, lo;

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
                            (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
                   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
}


static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
{
  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
                   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
}

/* vec_mulo (vector multiply odd)
 * ========
 */
static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
                             (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
{
  vec_short8 aa, bb, hi, lo;

  aa = spu_extend(a);
  bb = spu_extend(b);

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
  lo = (vec_short8)spu_mulo(aa, bb);
  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo(a, b));
}


static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
{
  return (spu_mulo(a, b));
}


/* vec_nmsub (vector negative multiply subtract)
 * =========
 */
#define vec_nmsub(_a, _b, _c)	spu_nmsub(_a, _b, _c)


/* vec_nor (vector logical nor)
 * =======
 */
#define vec_nor(_a, _b)		spu_nor(_a, _b)


/* vec_or (vector logical or)
 * ======
 */
static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
{
  return (spu_or((vec_char16)(a), b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
{
  return (spu_or(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
{
  return (spu_or((vec_short8)(a), b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
{
  return (spu_or(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
{
  return (spu_or((vec_int4)(a), b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_int4)(b)));
}

static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
{
  return (spu_or(a, b));
}

static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
{
  return (spu_or((vec_float4)(a), b));
}

static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_float4)(b)));
}


/* vec_pack (vector pack)
 * ========
 */
static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                        17, 19, 21, 23, 25, 27, 29, 31})));
}
static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
{
  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                       17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
{
  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                        18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
{
  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                       18, 19, 22, 23, 26, 27, 30, 31})));
}


/* vec_packpx (vector pack pixel)
 * ==========
 */
static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));

  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
                                           spu_sl(a, 13), x001F),
                                   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
                                           spu_sl(b, 13), x001F),
                                   ((vec_uchar16){ 0,  1,  4,  5,  8,  9, 12, 13,
                                                  16, 17, 20, 21, 24, 25, 28, 29}))));
}


/* vec_packs (vector pack saturate)
 * =========
 */
static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 max = spu_splats((unsigned short)0x00FF);

  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
                                    spu_sel(b, max, spu_cmpgt(b, 255)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x007F);
  vec_short8 min = spu_splats((signed short)0xFF80);

  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);

  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
                                    spu_sel(b, max, spu_cmpgt(b, max)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}

static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x00007FFF);
  vec_int4 min = spu_splats((signed int)0xFFFF8000);

  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_packsu (vector pack saturate unsigned)
 * ==========
 */
static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
                                   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31})));
}
static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x00FF);
  vec_short8 min = spu_splats((signed short)0x0000);

  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0xFFFF);

  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
                                   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x0000FFFF);
  vec_int4 min = spu_splats((signed int)0x00000000);

  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_perm (vector permute)
 * ========
 */
static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
{
  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
}

static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
{
  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
{
  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
{
  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
{
  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
{
  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
{
  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}
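/* Added note (not from the original header): VMX vec_perm uses only the low
 * five bits of each control byte to select one of the 32 source bytes, while
 * SPU spu_shuffle gives special meaning to control bytes with their high bits
 * set.  Masking the control vector with 0x1F above reproduces the VMX
 * behavior, e.g. with a hypothetical byte-reverse pattern:
 *
 *   vec_uchar16 rev = ((vec_uchar16){15, 14, 13, 12, 11, 10,  9,  8,
 *                                     7,  6,  5,  4,  3,  2,  1,  0});
 *   vec_uchar16 r   = vec_perm(a, b, rev);   // same result as on VMX
 */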

/* vec_re (vector reciprocal estimate)
 * ======
 */
#define vec_re(_a)	spu_re(_a)


/* vec_rl (vector rotate left)
 * ======
 */
static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 r1, r2;

  r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
  r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
  return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
}

static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}

static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}


/* vec_round (vector round)
 * =========
 */
static inline vec_float4 vec_round(vec_float4 a)
{
  vec_float4 s_half, s_one, d;
  vec_uint4 odd;
  vec_uint4 msb = spu_splats((unsigned int)0x80000000);
  vec_float4 half = spu_splats(0.5f);
  vec_int4 exp;
  vec_uint4 mask;

  s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
  a = spu_add(a, s_half);
  s_one = spu_add(s_half, s_half);
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
  s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
  s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
                                 (vec_float4)spu_cmpeq(odd, 1)));
  d = spu_andc(a, (vec_float4)(mask));
  d = spu_sub(d, s_one);
  return (d);
}

/* vec_rsqrte (vector reciprocal square root estimate)
 * ==========
 */
#define vec_rsqrte(_a)	spu_rsqrte(_a)


/* vec_sel (vector select)
 * =======
 */
#define vec_sel(_a, _b, _c)	spu_sel(_a, _b, _c)


/* vec_sl (vector shift left)
 * ======
 */
static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
  hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and(b, 15)));
}

static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
}

static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}

static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}


/* vec_sld (vector shift left double)
 * =======
 */
#define vec_sld(_a, _b, _c)	spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c),  \
								    4+(_c),  5+(_c),  6+(_c),  7+(_c),  \
								    8+(_c),  9+(_c), 10+(_c), 11+(_c),  \
								   12+(_c), 13+(_c), 14+(_c), 15+(_c)}))


/* vec_sll (vector shift left long)
 * =======
 */
#define vec_sll(_a, _b)		spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))


/* vec_slo (vector shift left by octet)
 * =======
 */
#define vec_slo(_a, _b)		spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)


/* vec_splat (vector splat)
 * =========
 */
#define vec_splat(_a, _b)	spu_splats(spu_extract(_a, _b))


/* vec_splat_s8 (vector splat signed byte)
 * ============
 */
#define vec_splat_s8(_a)	spu_splats((signed char)(_a))


/* vec_splat_s16 (vector splat signed half-word)
 * =============
 */
#define vec_splat_s16(_a)	spu_splats((signed short)(_a))

/* vec_splat_s32 (vector splat signed word)
 * =============
 */
#define vec_splat_s32(_a)	spu_splats((signed int)(_a))


/* vec_splat_u8 (vector splat unsigned byte)
 * ============
 */
#define vec_splat_u8(_a)	spu_splats((unsigned char)(_a))


/* vec_splat_u16 (vector splat unsigned half-word)
 * =============
 */
#define vec_splat_u16(_a)	spu_splats((unsigned short)(_a))


/* vec_splat_u32 (vector splat unsigned word)
 * =============
 */
#define vec_splat_u32(_a)	spu_splats((unsigned int)(_a))


/* vec_sr (vector shift right)
 * ======
 */
static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
  hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
{
  return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
}

static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
{
  return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
}


/* vec_sra (vector shift right algebraic)
 * =======
 */
static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
{
  vec_short8 hi, lo;

  lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
  hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_char16)(spu_or(hi, lo)));
}

static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
}

static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
}

static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
{
  return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
}


/* vec_srl (vector shift right long)
 * =======
 */
#define vec_srl(_a, _b)		spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))


/* vec_sro (vector shift right by octet)
 * =======
 */
#define vec_sro(_a, _b)		spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))

/* vec_st (vector store indexed)
 * ======
 */
static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
{
  *((vec_uchar16 *)(c+b)) = a;
}
static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
{
  *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, signed char *c)
{
  *((vec_char16 *)(c+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
{
  *((vec_char16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bchar16 a, int b, signed char *c)
{
  *((vec_bchar16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, signed short *c)
{
  *((vec_short8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
{
  *((vec_short8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bshort8 a, int b, signed short *c)
{
  *((vec_bshort8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, signed int *c)
{
  *((vec_int4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
{
  *((vec_int4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bint4 a, int b, signed int *c)
{
  *((vec_bint4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, float *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}


/* vec_ste (vector store element indexed)
 * =======
 */
static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
{
  unsigned char *ptr;

  ptr = c + b;
  *ptr = spu_extract(a, (int)(ptr) & 15);
}

static inline void vec_ste(vec_char16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
{
  unsigned short *ptr;

  ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
  *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
}

static inline void vec_ste(vec_short8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
{
  unsigned int *ptr;

  ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
  *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
}
static inline void vec_ste(vec_int4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_bint4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_float4 a, int b, float *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}


/* vec_stl (vector store indexed LRU)
 * =======
 */
#define vec_stl(_a, _b, _c)	vec_st(_a, _b, _c)


/* vec_sub (vector subtract)
 * =======
 */
static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
                                spu_splats((unsigned short)0xFF00))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
{
  return (spu_sub((vec_short8)(a), b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
{
  return (spu_sub(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
{
  return (spu_sub((vec_int4)(a), b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
{
  return (spu_sub(a, (vec_int4)(b)));
}

static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
{
  return (spu_sub(a, b));
}


/* vec_subc (vector subtract carryout)
 * ========
 */
#define vec_subc(_a, _b)	spu_genb(_a, _b)


/* vec_subs (vector subtract saturate)
 * ========
 */
static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
                                                        8, 24, 10, 26, 12, 28, 14, 30})));
  d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
  return (spu_andc(d, s));
}
spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F)); 2086 d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F)); 2087 2088 return ((vec_char16)(d)); 2089 } 2090 2091 static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b) 2092 { 2093 return (vec_subs((vec_char16)(a), b)); 2094 } 2095 2096 static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b) 2097 { 2098 return (vec_subs(a, (vec_char16)(b))); 2099 } 2100 2101 static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b) 2102 { 2103 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a))); 2104 } 2105 2106 static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b) 2107 { 2108 vec_short8 s; 2109 vec_short8 d; 2110 2111 s = spu_sub(a, b); 2112 d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15))); 2113 d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15))); 2114 2115 return (d); 2116 } 2117 2118 static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b) 2119 { 2120 return ((vec_short8)(vec_subs((vec_short8)(a), b))); 2121 } 2122 2123 static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b) 2124 { 2125 return ((vec_short8)(vec_subs(a, (vec_short8)(b)))); 2126 } 2127 2128 static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b) 2129 { 2130 return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a))); 2131 } 2132 2133 static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b) 2134 { 2135 vec_int4 s; 2136 vec_int4 d; 2137 2138 s = spu_sub(a, b); 2139 d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31))); 2140 d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31))); 2141 2142 return (d); 2143 } 2144 2145 static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b) 2146 { 2147 return ((vec_int4)(vec_subs((vec_int4)(a), b))); 2148 } 2149 2150 static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b) 2151 { 2152 return ((vec_int4)(vec_subs(a, (vec_int4)(b)))); 2153 } 2154 2155 2156 /* vec_sum4s (vector sum across partial (1/4) saturated) 2157 * ========= 2158 */ 2159 static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b) 2160 { 2161 vec_uint4 a01_23, a0123; 2162 2163 a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8), 2164 spu_and((vec_ushort8)(a), 0xFF))); 2165 a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF)); 2166 return (vec_adds(a0123, b)); 2167 } 2168 2169 static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b) 2170 { 2171 vec_int4 a01_23, a0123; 2172 2173 a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8), 2174 spu_extend(a))); 2175 a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23))); 2176 return (vec_adds(a0123, b)); 2177 } 2178 2179 static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b) 2180 { 2181 vec_int4 a0123; 2182 2183 a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a)); 2184 return (vec_adds(a0123, b)); 2185 } 2186 2187 2188 /* vec_sum2s (vector sum across partial (1/2) saturated) 2189 * ========= 2190 */ 2191 static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b) 2192 { 2193 vec_int4 c, d; 2194 vec_int4 sign1, sign2, sign3; 2195 vec_int4 carry, sum_l, sum_h, sat, sat_val; 2196 2197 sign1 = spu_rlmaska(a, -31); 2198 sign2 = spu_rlmaska(b, -31); 2199 2200 c = spu_rlqwbyte(a, -4); 2201 sign3 = 
spu_rlqwbyte(sign1, -4); 2202 2203 carry = spu_genc(a, b); 2204 sum_l = spu_add(a, b); 2205 sum_h = spu_addx(sign1, sign2, carry); 2206 2207 carry = spu_genc(sum_l, c); 2208 sum_l = spu_add(sum_l, c); 2209 sum_h = spu_addx(sum_h, sign3, carry); 2210 2211 sign1 = spu_rlmaska(sum_l, -31); 2212 sign2 = spu_rlmaska(sum_h, -31); 2213 2214 sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF)); 2215 2216 sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2)); 2217 2218 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1}); 2219 2220 return (d); 2221 } 2222 2223 2224 /* vec_sums (vector sum saturated) 2225 * ======== 2226 */ 2227 static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b) 2228 { 2229 vec_int4 a0, a1, a2, c0, c1, c2, d; 2230 vec_int4 sign_a, sign_b, sign_l, sign_h; 2231 vec_int4 sum_l, sum_h, sat, sat_val; 2232 2233 sign_a = spu_rlmaska(a, -31); 2234 sign_b = spu_rlmaska(b, -31); 2235 2236 a0 = spu_rlqwbyte(a, -12); 2237 a1 = spu_rlqwbyte(a, -8); 2238 a2 = spu_rlqwbyte(a, -4); 2239 2240 sum_l = spu_add(a, b); 2241 sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b)); 2242 2243 c2 = spu_genc(sum_l, a2); 2244 sum_l = spu_add(sum_l, a2); 2245 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2); 2246 2247 c1 = spu_genc(sum_l, a1); 2248 sum_l = spu_add(sum_l, a1); 2249 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1); 2250 2251 c0 = spu_genc(sum_l, a0); 2252 sum_l = spu_add(sum_l, a0); 2253 sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0); 2254 2255 sign_l = spu_rlmaska(sum_l, -31); 2256 sign_h = spu_rlmaska(sum_h, -31); 2257 2258 sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF)); 2259 2260 sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h)); 2261 2262 d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1})); 2263 2264 return (d); 2265 } 2266 2267 2268 /* vec_trunc (vector truncate) 2269 * ========= 2270 */ 2271 static inline vec_float4 vec_trunc(vec_float4 a) 2272 { 2273 vec_int4 exp; 2274 vec_uint4 mask; 2275 2276 exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF))); 2277 mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp); 2278 mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31)); 2279 mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1)); 2280 return (spu_andc(a, (vec_float4)(mask))); 2281 } 2282 2283 /* vec_unpackh (vector unpack high element) 2284 * =========== 2285 */ 2286 static inline vec_short8 vec_unpackh(vec_char16 a) 2287 { 2288 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3, 2289 4, 4, 5, 5, 6, 6, 7, 7})))); 2290 } 2291 2292 static inline vec_bshort8 vec_unpackh(vec_bchar16 a) 2293 { 2294 return ((vec_bshort8)(vec_unpackh((vec_char16)(a)))); 2295 } 2296 2297 static inline vec_int4 vec_unpackh(vec_short8 a) 2298 { 2299 return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3, 2300 0, 0, 4, 5, 0, 0, 6, 7})))); 2301 } 2302 2303 #ifdef SUPPORT_UNPACK_PIXEL 2304 /* Due to type conflicts, unpacking of pixel types and boolean shorts 2305 * cannot simultaneously be supported. By default, the boolean short is 2306 * supported. 
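 * To obtain the pixel overloads instead, SUPPORT_UNPACK_PIXEL must be
 * defined before this header is included (for example by compiling with
 * -DSUPPORT_UNPACK_PIXEL); the vec_bshort8 overloads of vec_unpackh and
 * vec_unpackl are then omitted.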
 */
static inline vec_uint4 vec_unpackh(vec_pixel8 a)
{
  vec_ushort8 p1, p2;

  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
                   spu_and((vec_ushort8)(a.p), 0x1F),
                   ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
                                   4, 128, 128, 21,  6, 128, 128, 23}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
                   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
                   ((vec_uchar16){ 128, 17, 1, 128,  128, 19, 3, 128,
                                   128, 21, 5, 128,  128, 23, 7, 128}));
  return ((vec_uint4)(spu_or(p1, p2)));
}

#else

static inline vec_bint4 vec_unpackh(vec_bshort8 a)
{
  return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
}
#endif


/* vec_unpackl (vector unpack low element)
 * ===========
 */
static inline vec_short8 vec_unpackl(vec_char16 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
                                                      12, 12, 13, 13, 14, 14, 15, 15}))));
}

static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
{
  return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
}

static inline vec_int4 vec_unpackl(vec_short8 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
                                                      0, 0, 12, 13, 0, 0, 14, 15}))));
}

#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot simultaneously be supported. By default, the boolean short is
 * supported.
 */
static inline vec_uint4 vec_unpackl(vec_pixel8 a)
{
  vec_ushort8 p1, p2;

  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
                   spu_and((vec_ushort8)(a.p), 0x1F),
                   ((vec_uchar16){ 8, 128, 128, 25,  10, 128, 128, 27,
                                  12, 128, 128, 29,  14, 128, 128, 31}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
                   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
                   ((vec_uchar16){ 128, 25, 9, 128,  128, 27, 11, 128,
                                   128, 29, 13, 128,  128, 31, 15, 128}));
  return ((vec_uint4)(spu_or(p1, p2)));
}

#else

static inline vec_bint4 vec_unpackl(vec_bshort8 a)
{
  return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
}
#endif


/* vec_xor (vector logical xor)
 * =======
 */
static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_xor(a, b));
}

static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
{
  return (spu_xor(a, b));
}

static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
{
  return (spu_xor((vec_char16)(a), b));
}

static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
{
  return (spu_xor(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_xor(a, b));
}

static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
{
  return (spu_xor(a, b));
}

static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
{
  return (spu_xor((vec_short8)(a), b));
}

static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
{
  return (spu_xor(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
{
  return (spu_xor(a, b));
}

static inline vec_int4 vec_xor(vec_int4 a,
vec_int4 b) 2437 { 2438 return (spu_xor(a, b)); 2439 } 2440 2441 static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b) 2442 { 2443 return (spu_xor((vec_int4)(a), b)); 2444 } 2445 2446 static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b) 2447 { 2448 return (spu_xor(a, (vec_int4)(b))); 2449 } 2450 2451 static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b) 2452 { 2453 return (spu_xor(a, b)); 2454 } 2455 2456 static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b) 2457 { 2458 return (spu_xor((vec_float4)(a),b)); 2459 } 2460 2461 static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b) 2462 { 2463 return (spu_xor(a, (vec_float4)(b))); 2464 } 2465 2466 /************************************************************************ 2467 * PREDICATES 2468 ************************************************************************/ 2469 2470 /* vec_all_eq (all elements equal) 2471 * ========== 2472 */ 2473 static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b) 2474 { 2475 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF)); 2476 } 2477 2478 static inline int vec_all_eq(vec_char16 a, vec_char16 b) 2479 { 2480 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF)); 2481 } 2482 2483 static inline int vec_all_eq(vec_bchar16 a, vec_char16 b) 2484 { 2485 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF)); 2486 } 2487 2488 static inline int vec_all_eq(vec_char16 a, vec_bchar16 b) 2489 { 2490 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF)); 2491 } 2492 2493 static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b) 2494 { 2495 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF)); 2496 } 2497 2498 static inline int vec_all_eq(vec_short8 a, vec_short8 b) 2499 { 2500 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF)); 2501 } 2502 2503 static inline int vec_all_eq(vec_bshort8 a, vec_short8 b) 2504 { 2505 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF)); 2506 } 2507 2508 static inline int vec_all_eq(vec_short8 a, vec_bshort8 b) 2509 { 2510 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF)); 2511 } 2512 2513 static inline int vec_all_eq(vec_uint4 a, vec_uint4 b) 2514 { 2515 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF)); 2516 } 2517 2518 static inline int vec_all_eq(vec_int4 a, vec_int4 b) 2519 { 2520 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF)); 2521 } 2522 2523 static inline int vec_all_eq(vec_bint4 a, vec_int4 b) 2524 { 2525 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF)); 2526 } 2527 2528 static inline int vec_all_eq(vec_int4 a, vec_bint4 b) 2529 { 2530 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF)); 2531 } 2532 2533 static inline int vec_all_eq(vec_float4 a, vec_float4 b) 2534 { 2535 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF)); 2536 } 2537 2538 2539 /* vec_all_ge (all elements greater than or equal) 2540 * ========== 2541 */ 2542 static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b) 2543 { 2544 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2545 } 2546 2547 static inline int vec_all_ge(vec_char16 a, vec_char16 b) 2548 { 2549 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2550 } 2551 2552 static inline int vec_all_ge(vec_bchar16 a, vec_char16 b) 2553 { 2554 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 
0)); 2555 } 2556 2557 static inline int vec_all_ge(vec_char16 a, vec_bchar16 b) 2558 { 2559 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0)); 2560 } 2561 2562 static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b) 2563 { 2564 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2565 } 2566 2567 static inline int vec_all_ge(vec_short8 a, vec_short8 b) 2568 { 2569 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2570 } 2571 2572 static inline int vec_all_ge(vec_bshort8 a, vec_short8 b) 2573 { 2574 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0)); 2575 } 2576 2577 static inline int vec_all_ge(vec_short8 a, vec_bshort8 b) 2578 { 2579 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0)); 2580 } 2581 2582 static inline int vec_all_ge(vec_uint4 a, vec_uint4 b) 2583 { 2584 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2585 } 2586 2587 static inline int vec_all_ge(vec_int4 a, vec_int4 b) 2588 { 2589 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2590 } 2591 2592 static inline int vec_all_ge(vec_bint4 a, vec_int4 b) 2593 { 2594 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0)); 2595 } 2596 2597 static inline int vec_all_ge(vec_int4 a, vec_bint4 b) 2598 { 2599 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0)); 2600 } 2601 2602 static inline int vec_all_ge(vec_float4 a, vec_float4 b) 2603 { 2604 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2605 } 2606 2607 2608 /* vec_all_gt (all elements greater than) 2609 * ========== 2610 */ 2611 static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b) 2612 { 2613 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF)); 2614 } 2615 2616 static inline int vec_all_gt(vec_char16 a, vec_char16 b) 2617 { 2618 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF)); 2619 } 2620 2621 static inline int vec_all_gt(vec_bchar16 a, vec_char16 b) 2622 { 2623 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF)); 2624 } 2625 2626 static inline int vec_all_gt(vec_char16 a, vec_bchar16 b) 2627 { 2628 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF)); 2629 } 2630 2631 static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b) 2632 { 2633 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF)); 2634 } 2635 2636 static inline int vec_all_gt(vec_short8 a, vec_short8 b) 2637 { 2638 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF)); 2639 } 2640 2641 static inline int vec_all_gt(vec_bshort8 a, vec_short8 b) 2642 { 2643 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF)); 2644 } 2645 2646 static inline int vec_all_gt(vec_short8 a, vec_bshort8 b) 2647 { 2648 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF)); 2649 } 2650 2651 static inline int vec_all_gt(vec_uint4 a, vec_uint4 b) 2652 { 2653 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2654 } 2655 2656 static inline int vec_all_gt(vec_int4 a, vec_int4 b) 2657 { 2658 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2659 } 2660 2661 static inline int vec_all_gt(vec_bint4 a, vec_int4 b) 2662 { 2663 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF)); 2664 } 2665 2666 static inline int vec_all_gt(vec_int4 a, vec_bint4 b) 2667 { 2668 return 
((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF)); 2669 } 2670 2671 static inline int vec_all_gt(vec_float4 a, vec_float4 b) 2672 { 2673 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2674 } 2675 2676 2677 /* vec_all_in (all elements in bounds) 2678 * ========== 2679 */ 2680 static inline int vec_all_in(vec_float4 a, vec_float4 b) 2681 { 2682 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF); 2683 } 2684 2685 2686 /* vec_all_le (all elements less than or equal) 2687 * ========== 2688 */ 2689 static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b) 2690 { 2691 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2692 } 2693 2694 static inline int vec_all_le(vec_char16 a, vec_char16 b) 2695 { 2696 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2697 } 2698 2699 static inline int vec_all_le(vec_bchar16 a, vec_char16 b) 2700 { 2701 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0)); 2702 } 2703 2704 static inline int vec_all_le(vec_char16 a, vec_bchar16 b) 2705 { 2706 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0)); 2707 } 2708 2709 static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b) 2710 { 2711 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2712 } 2713 2714 static inline int vec_all_le(vec_short8 a, vec_short8 b) 2715 { 2716 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2717 } 2718 2719 static inline int vec_all_le(vec_bshort8 a, vec_short8 b) 2720 { 2721 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0)); 2722 } 2723 2724 static inline int vec_all_le(vec_short8 a, vec_bshort8 b) 2725 { 2726 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0)); 2727 } 2728 2729 static inline int vec_all_le(vec_uint4 a, vec_uint4 b) 2730 { 2731 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2732 } 2733 2734 static inline int vec_all_le(vec_int4 a, vec_int4 b) 2735 { 2736 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2737 } 2738 2739 static inline int vec_all_le(vec_bint4 a, vec_int4 b) 2740 { 2741 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0)); 2742 } 2743 2744 static inline int vec_all_le(vec_int4 a, vec_bint4 b) 2745 { 2746 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0)); 2747 } 2748 2749 static inline int vec_all_le(vec_float4 a, vec_float4 b) 2750 { 2751 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2752 } 2753 2754 2755 /* vec_all_lt (all elements less than) 2756 * ========== 2757 */ 2758 static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b) 2759 { 2760 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF)); 2761 } 2762 2763 static inline int vec_all_lt(vec_char16 a, vec_char16 b) 2764 { 2765 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF)); 2766 } 2767 2768 static inline int vec_all_lt(vec_bchar16 a, vec_char16 b) 2769 { 2770 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF)); 2771 } 2772 2773 static inline int vec_all_lt(vec_char16 a, vec_bchar16 b) 2774 { 2775 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF)); 2776 } 2777 2778 static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b) 2779 { 2780 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF)); 2781 } 2782 2783 
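/* Note: these predicates share a common pattern.  spu_gather packs the
 * least-significant bit of each element of a compare result into the
 * preferred word, so a 16-, 8- or 4-element comparison yields a mask of
 * 0xFFFF, 0xFF or 0xF when the relation holds in every element, and 0 when
 * it holds in none; the vec_all_* and vec_any_* predicates test the gathered
 * mask against those values (a few word-sized vec_any_* overloads use
 * spu_orx on the compare result instead).
 *
 * Illustrative use (a minimal sketch, not part of the mapping; the index
 * and bound values below are hypothetical):
 *
 *   vec_int4 idx   = (vec_int4){0, 1, 2, 3};
 *   vec_int4 bound = spu_splats((signed int)4);
 *   if (vec_all_lt(idx, bound)) {
 *     ... every lane is below the bound ...
 *   }
 */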
static inline int vec_all_lt(vec_short8 a, vec_short8 b) 2784 { 2785 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF)); 2786 } 2787 2788 static inline int vec_all_lt(vec_bshort8 a, vec_short8 b) 2789 { 2790 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF)); 2791 } 2792 2793 static inline int vec_all_lt(vec_short8 a, vec_bshort8 b) 2794 { 2795 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF)); 2796 } 2797 2798 static inline int vec_all_lt(vec_uint4 a, vec_uint4 b) 2799 { 2800 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2801 } 2802 2803 static inline int vec_all_lt(vec_int4 a, vec_int4 b) 2804 { 2805 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2806 } 2807 2808 static inline int vec_all_lt(vec_bint4 a, vec_int4 b) 2809 { 2810 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF)); 2811 } 2812 2813 static inline int vec_all_lt(vec_int4 a, vec_bint4 b) 2814 { 2815 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF)); 2816 } 2817 2818 static inline int vec_all_lt(vec_float4 a, vec_float4 b) 2819 { 2820 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2821 } 2822 2823 2824 /* vec_all_nan (all elements not a number) 2825 * =========== 2826 */ 2827 static inline int vec_all_nan(vec_float4 a) 2828 { 2829 vec_uint4 exp, man; 2830 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000); 2831 2832 exp = spu_and((vec_uint4)(a), exp_mask); 2833 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF)); 2834 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask), 2835 spu_cmpeq(man, 0))), 0) == 0xF)); 2836 } 2837 2838 #define vec_all_nan(_a) (0) 2839 2840 2841 /* vec_all_ne (all elements not equal) 2842 * ========== 2843 */ 2844 static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b) 2845 { 2846 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2847 } 2848 2849 static inline int vec_all_ne(vec_char16 a, vec_char16 b) 2850 { 2851 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2852 } 2853 2854 static inline int vec_all_ne(vec_bchar16 a, vec_char16 b) 2855 { 2856 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0)); 2857 } 2858 2859 static inline int vec_all_ne(vec_char16 a, vec_bchar16 b) 2860 { 2861 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0)); 2862 } 2863 2864 static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b) 2865 { 2866 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2867 } 2868 2869 static inline int vec_all_ne(vec_short8 a, vec_short8 b) 2870 { 2871 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2872 } 2873 2874 static inline int vec_all_ne(vec_bshort8 a, vec_short8 b) 2875 { 2876 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0)); 2877 } 2878 2879 static inline int vec_all_ne(vec_short8 a, vec_bshort8 b) 2880 { 2881 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0)); 2882 } 2883 2884 static inline int vec_all_ne(vec_uint4 a, vec_uint4 b) 2885 { 2886 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2887 } 2888 2889 static inline int vec_all_ne(vec_int4 a, vec_int4 b) 2890 { 2891 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2892 } 2893 2894 static inline int vec_all_ne(vec_bint4 a, vec_int4 b) 2895 { 2896 return 
((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0)); 2897 } 2898 2899 static inline int vec_all_ne(vec_int4 a, vec_bint4 b) 2900 { 2901 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0)); 2902 } 2903 2904 static inline int vec_all_ne(vec_float4 a, vec_float4 b) 2905 { 2906 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0)); 2907 } 2908 2909 2910 /* vec_all_nge (all elements not greater than or equal) 2911 * =========== 2912 */ 2913 static inline int vec_all_nge(vec_float4 a, vec_float4 b) 2914 { 2915 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF)); 2916 } 2917 2918 2919 /* vec_all_ngt (all elements not greater than) 2920 * =========== 2921 */ 2922 static inline int vec_all_ngt(vec_float4 a, vec_float4 b) 2923 { 2924 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0)); 2925 } 2926 2927 2928 /* vec_all_nle (all elements not less than or equal) 2929 * =========== 2930 */ 2931 static inline int vec_all_nle(vec_float4 a, vec_float4 b) 2932 { 2933 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF)); 2934 } 2935 2936 2937 /* vec_all_nlt (all elements not less than) 2938 * =========== 2939 */ 2940 static inline int vec_all_nlt(vec_float4 a, vec_float4 b) 2941 { 2942 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0)); 2943 } 2944 2945 2946 /* vec_all_numeric (all elements numeric) 2947 * =========== 2948 */ 2949 static inline int vec_all_numeric(vec_float4 a) 2950 { 2951 vec_uint4 exp; 2952 2953 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF); 2954 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0)); 2955 } 2956 2957 2958 2959 /* vec_any_eq (any elements equal) 2960 * ========== 2961 */ 2962 static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b) 2963 { 2964 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2965 } 2966 2967 static inline int vec_any_eq(vec_char16 a, vec_char16 b) 2968 { 2969 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2970 } 2971 2972 static inline int vec_any_eq(vec_bchar16 a, vec_char16 b) 2973 { 2974 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0)); 2975 } 2976 2977 static inline int vec_any_eq(vec_char16 a, vec_bchar16 b) 2978 { 2979 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0)); 2980 } 2981 2982 static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b) 2983 { 2984 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2985 } 2986 2987 static inline int vec_any_eq(vec_short8 a, vec_short8 b) 2988 { 2989 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0)); 2990 } 2991 2992 static inline int vec_any_eq(vec_bshort8 a, vec_short8 b) 2993 { 2994 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0)); 2995 } 2996 2997 static inline int vec_any_eq(vec_short8 a, vec_bshort8 b) 2998 { 2999 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0)); 3000 } 3001 3002 static inline int vec_any_eq(vec_uint4 a, vec_uint4 b) 3003 { 3004 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0))); 3005 } 3006 3007 static inline int vec_any_eq(vec_int4 a, vec_int4 b) 3008 { 3009 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0))); 3010 } 3011 3012 static inline int vec_any_eq(vec_bint4 a, vec_int4 b) 3013 { 3014 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0))); 3015 } 3016 3017 static inline int vec_any_eq(vec_int4 
a, vec_bint4 b) 3018 { 3019 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0))); 3020 } 3021 3022 static inline int vec_any_eq(vec_float4 a, vec_float4 b) 3023 { 3024 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0))); 3025 } 3026 3027 /* vec_any_ge (any elements greater than or equal) 3028 * ========== 3029 */ 3030 static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b) 3031 { 3032 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF)); 3033 } 3034 3035 static inline int vec_any_ge(vec_char16 a, vec_char16 b) 3036 { 3037 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF)); 3038 } 3039 3040 static inline int vec_any_ge(vec_bchar16 a, vec_char16 b) 3041 { 3042 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF)); 3043 } 3044 3045 static inline int vec_any_ge(vec_char16 a, vec_bchar16 b) 3046 { 3047 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF)); 3048 } 3049 3050 static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b) 3051 { 3052 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF)); 3053 } 3054 3055 static inline int vec_any_ge(vec_short8 a, vec_short8 b) 3056 { 3057 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF)); 3058 } 3059 3060 static inline int vec_any_ge(vec_bshort8 a, vec_short8 b) 3061 { 3062 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF)); 3063 } 3064 3065 static inline int vec_any_ge(vec_short8 a, vec_bshort8 b) 3066 { 3067 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF)); 3068 } 3069 3070 static inline int vec_any_ge(vec_uint4 a, vec_uint4 b) 3071 { 3072 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3073 } 3074 3075 static inline int vec_any_ge(vec_int4 a, vec_int4 b) 3076 { 3077 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3078 } 3079 3080 static inline int vec_any_ge(vec_bint4 a, vec_int4 b) 3081 { 3082 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF)); 3083 } 3084 3085 static inline int vec_any_ge(vec_int4 a, vec_bint4 b) 3086 { 3087 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF)); 3088 } 3089 3090 static inline int vec_any_ge(vec_float4 a, vec_float4 b) 3091 { 3092 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3093 } 3094 3095 3096 /* vec_any_gt (any elements greater than) 3097 * ========== 3098 */ 3099 static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b) 3100 { 3101 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3102 } 3103 3104 static inline int vec_any_gt(vec_char16 a, vec_char16 b) 3105 { 3106 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3107 } 3108 3109 static inline int vec_any_gt(vec_bchar16 a, vec_char16 b) 3110 { 3111 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0)); 3112 } 3113 3114 static inline int vec_any_gt(vec_char16 a, vec_bchar16 b) 3115 { 3116 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0)); 3117 } 3118 3119 static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b) 3120 { 3121 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3122 } 3123 3124 static inline int vec_any_gt(vec_short8 a, vec_short8 b) 3125 { 3126 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3127 } 3128 3129 static inline int vec_any_gt(vec_bshort8 a, vec_short8 b) 
3130 { 3131 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0)); 3132 } 3133 3134 static inline int vec_any_gt(vec_short8 a, vec_bshort8 b) 3135 { 3136 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0)); 3137 } 3138 3139 3140 static inline int vec_any_gt(vec_uint4 a, vec_uint4 b) 3141 { 3142 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0))); 3143 } 3144 3145 static inline int vec_any_gt(vec_int4 a, vec_int4 b) 3146 { 3147 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0))); 3148 } 3149 3150 static inline int vec_any_gt(vec_bint4 a, vec_int4 b) 3151 { 3152 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0))); 3153 } 3154 3155 static inline int vec_any_gt(vec_int4 a, vec_bint4 b) 3156 { 3157 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0))); 3158 } 3159 3160 static inline int vec_any_gt(vec_float4 a, vec_float4 b) 3161 { 3162 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0))); 3163 } 3164 3165 /* vec_any_le (any elements less than or equal) 3166 * ========== 3167 */ 3168 static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b) 3169 { 3170 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF)); 3171 } 3172 3173 static inline int vec_any_le(vec_char16 a, vec_char16 b) 3174 { 3175 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF)); 3176 } 3177 3178 static inline int vec_any_le(vec_bchar16 a, vec_char16 b) 3179 { 3180 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF)); 3181 } 3182 3183 static inline int vec_any_le(vec_char16 a, vec_bchar16 b) 3184 { 3185 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF)); 3186 } 3187 3188 static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b) 3189 { 3190 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF)); 3191 } 3192 3193 static inline int vec_any_le(vec_short8 a, vec_short8 b) 3194 { 3195 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF)); 3196 } 3197 3198 static inline int vec_any_le(vec_bshort8 a, vec_short8 b) 3199 { 3200 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF)); 3201 } 3202 3203 static inline int vec_any_le(vec_short8 a, vec_bshort8 b) 3204 { 3205 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF)); 3206 } 3207 3208 static inline int vec_any_le(vec_uint4 a, vec_uint4 b) 3209 { 3210 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3211 } 3212 3213 static inline int vec_any_le(vec_int4 a, vec_int4 b) 3214 { 3215 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3216 } 3217 3218 static inline int vec_any_le(vec_bint4 a, vec_int4 b) 3219 { 3220 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF)); 3221 } 3222 3223 static inline int vec_any_le(vec_int4 a, vec_bint4 b) 3224 { 3225 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF)); 3226 } 3227 3228 static inline int vec_any_le(vec_float4 a, vec_float4 b) 3229 { 3230 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3231 } 3232 3233 3234 /* vec_any_lt (any elements less than) 3235 * ========== 3236 */ 3237 static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b) 3238 { 3239 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3240 } 3241 3242 static inline int vec_any_lt(vec_char16 a, vec_char16 
b) 3243 { 3244 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3245 } 3246 3247 static inline int vec_any_lt(vec_bchar16 a, vec_char16 b) 3248 { 3249 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0)); 3250 } 3251 3252 static inline int vec_any_lt(vec_char16 a, vec_bchar16 b) 3253 { 3254 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0)); 3255 } 3256 3257 static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b) 3258 { 3259 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3260 } 3261 3262 static inline int vec_any_lt(vec_short8 a, vec_short8 b) 3263 { 3264 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0)); 3265 } 3266 3267 static inline int vec_any_lt(vec_bshort8 a, vec_short8 b) 3268 { 3269 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0)); 3270 } 3271 3272 static inline int vec_any_lt(vec_short8 a, vec_bshort8 b) 3273 { 3274 return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0)); 3275 } 3276 3277 static inline int vec_any_lt(vec_uint4 a, vec_uint4 b) 3278 { 3279 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0))); 3280 } 3281 3282 static inline int vec_any_lt(vec_int4 a, vec_int4 b) 3283 { 3284 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0))); 3285 } 3286 3287 static inline int vec_any_lt(vec_bint4 a, vec_int4 b) 3288 { 3289 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0))); 3290 } 3291 3292 static inline int vec_any_lt(vec_int4 a, vec_bint4 b) 3293 { 3294 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0))); 3295 } 3296 3297 static inline int vec_any_lt(vec_float4 a, vec_float4 b) 3298 { 3299 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0))); 3300 } 3301 3302 /* vec_any_nan (any elements not a number) 3303 * =========== 3304 */ 3305 static inline int vec_any_nan(vec_float4 a) 3306 { 3307 vec_uint4 exp, man; 3308 vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000); 3309 3310 exp = spu_and((vec_uint4)(a), exp_mask); 3311 man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF)); 3312 return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask), 3313 spu_cmpeq(man, 0))), 0) != 0)); 3314 } 3315 3316 3317 /* vec_any_ne (any elements not equal) 3318 * ========== 3319 */ 3320 static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b) 3321 { 3322 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF)); 3323 } 3324 3325 static inline int vec_any_ne(vec_char16 a, vec_char16 b) 3326 { 3327 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF)); 3328 } 3329 3330 static inline int vec_any_ne(vec_bchar16 a, vec_char16 b) 3331 { 3332 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF)); 3333 } 3334 3335 static inline int vec_any_ne(vec_char16 a, vec_bchar16 b) 3336 { 3337 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF)); 3338 } 3339 3340 static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b) 3341 { 3342 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF)); 3343 } 3344 3345 static inline int vec_any_ne(vec_short8 a, vec_short8 b) 3346 { 3347 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF)); 3348 } 3349 3350 static inline int vec_any_ne(vec_bshort8 a, vec_short8 b) 3351 { 3352 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 
0xFF)); 3353 } 3354 3355 static inline int vec_any_ne(vec_short8 a, vec_bshort8 b) 3356 { 3357 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF)); 3358 } 3359 3360 static inline int vec_any_ne(vec_uint4 a, vec_uint4 b) 3361 { 3362 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF)); 3363 } 3364 3365 static inline int vec_any_ne(vec_int4 a, vec_int4 b) 3366 { 3367 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF)); 3368 } 3369 3370 static inline int vec_any_ne(vec_bint4 a, vec_int4 b) 3371 { 3372 return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF)); 3373 } 3374 3375 static inline int vec_any_ne(vec_int4 a, vec_bint4 b) 3376 { 3377 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF)); 3378 } 3379 3380 static inline int vec_any_ne(vec_float4 a, vec_float4 b) 3381 { 3382 return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF)); 3383 } 3384 3385 3386 /* vec_any_nge (any elements not greater than or equal) 3387 * =========== 3388 */ 3389 static inline int vec_any_nge(vec_float4 a, vec_float4 b) 3390 { 3391 return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0))); 3392 } 3393 3394 /* vec_any_ngt (any elements not greater than) 3395 * =========== 3396 */ 3397 static inline int vec_any_ngt(vec_float4 a, vec_float4 b) 3398 { 3399 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF)); 3400 } 3401 3402 3403 /* vec_any_nle (any elements not less than or equal) 3404 * =========== 3405 */ 3406 static inline int vec_any_nle(vec_float4 a, vec_float4 b) 3407 { 3408 return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0)); 3409 } 3410 3411 3412 /* vec_any_nlt (any elements not less than) 3413 * =========== 3414 */ 3415 static inline int vec_any_nlt(vec_float4 a, vec_float4 b) 3416 { 3417 return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF)); 3418 } 3419 3420 3421 /* vec_any_numeric (any elements numeric) 3422 * =============== 3423 */ 3424 static inline int vec_any_numeric(vec_float4 a) 3425 { 3426 vec_uint4 exp; 3427 3428 exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF); 3429 return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF)); 3430 } 3431 3432 3433 /* vec_any_out (any elements out of bounds) 3434 * =========== 3435 */ 3436 static inline int vec_any_out(vec_float4 a, vec_float4 b) 3437 { 3438 return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF); 3439 } 3440 3441 3442 /* CBE Language Extension Intrinsics 3443 */ 3444 3445 /* vec_extract (extract element from vector) 3446 * =========== 3447 */ 3448 #define vec_extract(_a, _element) spu_extract(_a, _element) 3449 3450 3451 /* vec_insert (insert scalar into specified vector element) 3452 * ========== 3453 */ 3454 #define vec_insert(_a, _b, _element) spu_insert(_a, _b, _element) 3455 3456 /* vec_lvlx (load vector left indexed) 3457 * ======== 3458 */ 3459 static inline vec_uchar16 vec_lvlx(int a, unsigned char *b) 3460 { 3461 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a); 3462 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3463 } 3464 3465 static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b) 3466 { 3467 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a); 3468 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3469 } 3470 3471 static inline vec_char16 vec_lvlx(int a, signed char *b) 3472 { 3473 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a); 3474 return(spu_slqwbyte(*p, 
(unsigned int)p & 0xF)); 3475 } 3476 3477 static inline vec_char16 vec_lvlx(int a, vec_char16 *b) 3478 { 3479 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a); 3480 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3481 } 3482 3483 static inline vec_ushort8 vec_lvlx(int a, unsigned short *b) 3484 { 3485 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a); 3486 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3487 } 3488 3489 static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b) 3490 { 3491 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a); 3492 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3493 } 3494 3495 static inline vec_short8 vec_lvlx(int a, signed short *b) 3496 { 3497 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a); 3498 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3499 } 3500 3501 static inline vec_short8 vec_lvlx(int a, vec_short8 *b) 3502 { 3503 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a); 3504 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3505 } 3506 3507 static inline vec_uint4 vec_lvlx(int a, unsigned int *b) 3508 { 3509 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a); 3510 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3511 } 3512 3513 static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b) 3514 { 3515 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a); 3516 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3517 } 3518 3519 static inline vec_int4 vec_lvlx(int a, signed int *b) 3520 { 3521 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a); 3522 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3523 } 3524 3525 static inline vec_int4 vec_lvlx(int a, vec_int4 *b) 3526 { 3527 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a); 3528 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3529 } 3530 3531 static inline vec_float4 vec_lvlx(int a, float *b) 3532 { 3533 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a); 3534 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3535 } 3536 3537 static inline vec_float4 vec_lvlx(int a, vec_float4 *b) 3538 { 3539 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a); 3540 return(spu_slqwbyte(*p, (unsigned int)p & 0xF)); 3541 } 3542 3543 3544 /* vec_lvlxl (load vector left indexed last) 3545 * ========= 3546 */ 3547 #define vec_lvlxl(_a, _b) vec_lvlx(_a, _b) 3548 3549 3550 /* vec_lvrx (load vector right indexed) 3551 * ======== 3552 */ 3553 static inline vec_uchar16 vec_lvrx(int a, unsigned char *b) 3554 { 3555 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a); 3556 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3557 } 3558 3559 static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b) 3560 { 3561 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a); 3562 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3563 } 3564 3565 static inline vec_char16 vec_lvrx(int a, signed char *b) 3566 { 3567 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a); 3568 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3569 } 3570 3571 static inline vec_char16 vec_lvrx(int a, vec_char16 *b) 3572 { 3573 vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a); 3574 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3575 } 3576 3577 static inline vec_ushort8 vec_lvrx(int a, unsigned short *b) 3578 { 3579 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a); 3580 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3581 } 3582 3583 static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b) 3584 { 3585 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a); 3586 
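  /* The SPU quadword load ignores the low four address bits, so *p fetches
   * the aligned quadword containing (b + a).  Shifting it right by
   * 16 - ((int)p & 0xF) bytes right-justifies the bytes that precede the
   * addressed byte, which is what the VMX right-indexed load returns; the
   * bytes at and beyond the address are supplied by the matching vec_lvlx.
   * The same construction is used by every vec_lvrx overload.  */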
return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3587 } 3588 3589 static inline vec_short8 vec_lvrx(int a, signed short *b) 3590 { 3591 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a); 3592 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3593 } 3594 3595 static inline vec_short8 vec_lvrx(int a, vec_short8 *b) 3596 { 3597 vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a); 3598 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3599 } 3600 3601 static inline vec_uint4 vec_lvrx(int a, unsigned int *b) 3602 { 3603 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a); 3604 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3605 } 3606 3607 static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b) 3608 { 3609 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a); 3610 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3611 } 3612 3613 static inline vec_int4 vec_lvrx(int a, signed int *b) 3614 { 3615 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a); 3616 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3617 } 3618 3619 static inline vec_int4 vec_lvrx(int a, vec_int4 *b) 3620 { 3621 vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a); 3622 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3623 } 3624 3625 static inline vec_float4 vec_lvrx(int a, float *b) 3626 { 3627 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a); 3628 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3629 } 3630 3631 static inline vec_float4 vec_lvrx(int a, vec_float4 *b) 3632 { 3633 vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a); 3634 return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16)); 3635 } 3636 3637 3638 3639 /* vec_lvrxl (load vector right indexed last) 3640 * ========= 3641 */ 3642 #define vec_lvrxl(_a, _b) vec_lvrx(_a, _b) 3643 3644 3645 /* vec_promote (promote scalar to a vector) 3646 * =========== 3647 */ 3648 #define vec_promote(_a, _element) spu_promote(_a, _element) 3649 3650 3651 /* vec_splats (splat scalar to a vector) 3652 * ========== 3653 */ 3654 #define vec_splats(_a) spu_splats(_a) 3655 3656 3657 /* vec_stvlx (store vector left indexed) 3658 * ========= 3659 */ 3660 static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c) 3661 { 3662 int shift; 3663 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b); 3664 3665 shift = -((int)p & 0xF); 3666 *p = spu_sel(*p, 3667 spu_rlmaskqwbyte(a, shift), 3668 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift)); 3669 } 3670 3671 static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c) 3672 { 3673 int shift; 3674 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b); 3675 3676 shift = -((int)p & 0xF); 3677 *p = spu_sel(*p, 3678 spu_rlmaskqwbyte(a, shift), 3679 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift)); 3680 } 3681 3682 static inline void vec_stvlx(vec_char16 a, int b, signed char *c) 3683 { 3684 int shift; 3685 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b); 3686 3687 shift = -((int)p & 0xF); 3688 *p = spu_sel(*p, 3689 spu_rlmaskqwbyte(a, shift), 3690 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift)); 3691 } 3692 3693 static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c) 3694 { 3695 int shift; 3696 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b); 3697 3698 shift = -((int)p & 0xF); 3699 *p = spu_sel(*p, 3700 spu_rlmaskqwbyte(a, shift), 3701 spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift)); 3702 } 3703 3704 static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c) 3705 { 3706 int shift; 3707 vec_ushort8 *p = (vec_ushort8 *)((unsigned 
char *)(c) + b); 3708 3709 shift = -((int)p & 0xF); 3710 *p = spu_sel(*p, 3711 spu_rlmaskqwbyte(a, shift), 3712 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3713 } 3714 3715 static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c) 3716 { 3717 int shift; 3718 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b); 3719 3720 shift = -((int)p & 0xF); 3721 *p = spu_sel(*p, 3722 spu_rlmaskqwbyte(a, shift), 3723 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3724 } 3725 3726 static inline void vec_stvlx(vec_short8 a, int b, signed short *c) 3727 { 3728 int shift; 3729 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b); 3730 3731 shift = -((int)p & 0xF); 3732 *p = spu_sel(*p, 3733 spu_rlmaskqwbyte(a, shift), 3734 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3735 } 3736 3737 static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c) 3738 { 3739 int shift; 3740 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b); 3741 3742 shift = -((int)p & 0xF); 3743 *p = spu_sel(*p, 3744 spu_rlmaskqwbyte(a, shift), 3745 spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3746 } 3747 3748 static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c) 3749 { 3750 int shift; 3751 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b); 3752 3753 shift = -((int)p & 0xF); 3754 *p = spu_sel(*p, 3755 spu_rlmaskqwbyte(a, shift), 3756 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3757 } 3758 3759 static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c) 3760 { 3761 int shift; 3762 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b); 3763 3764 shift = -((int)p & 0xF); 3765 *p = spu_sel(*p, 3766 spu_rlmaskqwbyte(a, shift), 3767 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3768 } 3769 3770 static inline void vec_stvlx(vec_int4 a, int b, signed int *c) 3771 { 3772 int shift; 3773 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b); 3774 3775 shift = -((int)p & 0xF); 3776 *p = spu_sel(*p, 3777 spu_rlmaskqwbyte(a, shift), 3778 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3779 } 3780 3781 static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c) 3782 { 3783 int shift; 3784 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b); 3785 3786 shift = -((int)p & 0xF); 3787 *p = spu_sel(*p, 3788 spu_rlmaskqwbyte(a, shift), 3789 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3790 } 3791 3792 static inline void vec_stvlx(vec_float4 a, int b, float *c) 3793 { 3794 int shift; 3795 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b); 3796 3797 shift = -((int)p & 0xF); 3798 *p = spu_sel(*p, 3799 spu_rlmaskqwbyte(a, shift), 3800 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3801 } 3802 3803 static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c) 3804 { 3805 int shift; 3806 vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b); 3807 3808 shift = -((int)p & 0xF); 3809 *p = spu_sel(*p, 3810 spu_rlmaskqwbyte(a, shift), 3811 spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3812 } 3813 3814 /* vec_stvlxl (store vector left indexed last) 3815 * ========== 3816 */ 3817 #define vec_stvlxl(_a, _b, _c) vec_stvlx(_a, _b, _c) 3818 3819 3820 /* vec_stvrx (store vector right indexed) 3821 * ========= 3822 */ 3823 static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c) 3824 { 3825 int shift; 3826 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b); 3827 3828 shift = 16-((int)p & 0xF); 3829 *p = spu_sel(*p, 3830 
spu_slqwbyte(a, shift), 3831 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift)); 3832 } 3833 3834 static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c) 3835 { 3836 int shift; 3837 vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b); 3838 3839 shift = 16-((int)p & 0xF); 3840 *p = spu_sel(*p, 3841 spu_slqwbyte(a, shift), 3842 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift)); 3843 } 3844 3845 static inline void vec_stvrx(vec_char16 a, int b, signed char *c) 3846 { 3847 int shift; 3848 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b); 3849 3850 shift = 16-((int)p & 0xF); 3851 *p = spu_sel(*p, 3852 spu_slqwbyte(a, shift), 3853 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift)); 3854 } 3855 3856 static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c) 3857 { 3858 int shift; 3859 vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b); 3860 3861 shift = 16-((int)p & 0xF); 3862 *p = spu_sel(*p, 3863 spu_slqwbyte(a, shift), 3864 spu_slqwbyte(spu_splats((unsigned char)0xFF), shift)); 3865 } 3866 3867 static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c) 3868 { 3869 int shift; 3870 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b); 3871 3872 shift = 16-((int)p & 0xF); 3873 *p = spu_sel(*p, 3874 spu_slqwbyte(a, shift), 3875 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3876 } 3877 3878 static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c) 3879 { 3880 int shift; 3881 vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b); 3882 3883 shift = 16-((int)p & 0xF); 3884 *p = spu_sel(*p, 3885 spu_slqwbyte(a, shift), 3886 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3887 } 3888 3889 static inline void vec_stvrx(vec_short8 a, int b, signed short *c) 3890 { 3891 int shift; 3892 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b); 3893 3894 shift = 16-((int)p & 0xF); 3895 *p = spu_sel(*p, 3896 spu_slqwbyte(a, shift), 3897 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3898 } 3899 3900 static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c) 3901 { 3902 int shift; 3903 vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b); 3904 3905 shift = 16-((int)p & 0xF); 3906 *p = spu_sel(*p, 3907 spu_slqwbyte(a, shift), 3908 spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift)); 3909 } 3910 3911 static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c) 3912 { 3913 int shift; 3914 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b); 3915 3916 shift = 16-((int)p & 0xF); 3917 *p = spu_sel(*p, 3918 spu_slqwbyte(a, shift), 3919 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3920 } 3921 3922 static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c) 3923 { 3924 int shift; 3925 vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b); 3926 3927 shift = 16-((int)p & 0xF); 3928 *p = spu_sel(*p, 3929 spu_slqwbyte(a, shift), 3930 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3931 } 3932 3933 static inline void vec_stvrx(vec_int4 a, int b, signed int *c) 3934 { 3935 int shift; 3936 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b); 3937 3938 shift = 16-((int)p & 0xF); 3939 *p = spu_sel(*p, 3940 spu_slqwbyte(a, shift), 3941 spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift)); 3942 } 3943 3944 static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c) 3945 { 3946 int shift; 3947 vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b); 3948 3949 shift = 16-((int)p & 0xF); 3950 *p = spu_sel(*p, 3951 spu_slqwbyte(a, shift), 3952 
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c)	vec_stvrx(_a, _b, _c)


#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */