[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[pygame] New strict MMX patch for transform.c
I am including a new patch for transform.c. It replaces the patches I
submitted earlier. It cleans up the assembly code and omits the
NO_SSE flag. Profiling shows that filter_shrink_X_MMX is still nearly 4X
faster than filter_shrink_X_ONLYC. No extra tweaking, by interleaving
instructions, was done since that showed a less than 1% performance
change (below the margin of error for the profile).
--
Lenard Lindstrom
<len-l@xxxxxxxxx>
1018c1018
< long long One64 = 0x4000400040004000;
---
> long long One64 = 0x4000400040004000ULL;
1025,1026c1025,1026
< " movq %2, %%mm6; " /* mm6 = 2^14 */
< " pshufw $0, %%mm7, %%mm7; "
---
> " punpcklwd %%mm7, %%mm7; "
> " punpckldq %%mm7, %%mm7; "
1042,1043c1042,1044
< " movq %%mm6, %%mm3; " /* mm3 = 2^14 */
< " pshufw $0, %%mm2, %%mm2; "
---
> " movq %2, %%mm3; " /* mm3 = 2^14 */
> " punpcklwd %%mm2, %%mm2; "
> " punpckldq %%mm2, %%mm2; "
1049,1050c1050,1067
< " pmulhuw %%mm4, %%mm2; " /* mm2 = (srcpix * xcounter >> 16) */
< " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * xfrac) >> 16 */
---
> " movq %%mm4, %%mm5; " /* mm2 = (srcpix * xcounter >> 16) */
> " psraw $15, %%mm5; "
> " pand %%mm2, %%mm5; "
> " movq %%mm2, %%mm6; "
> " psraw $15, %%mm6; "
> " pand %%mm4, %%mm6; "
> " pmulhw %%mm4, %%mm2; "
> " paddw %%mm5, %%mm2; "
> " paddw %%mm6, %%mm2; "
> " movq %%mm4, %%mm5; " /* mm3 = (srcpix * xfrac) >> 16) */
> " psraw $15, %%mm5; "
> " pand %%mm3, %%mm5; "
> " movq %%mm3, %%mm6; "
> " psraw $15, %%mm6; "
> " pand %%mm4, %%mm6; "
> " pmulhw %%mm4, %%mm3; "
> " paddw %%mm5, %%mm3; "
> " paddw %%mm6, %%mm3; "
1053c1070,1078
< " pmulhuw %%mm7, %%mm2; "
---
> " movq %%mm7, %%mm5; "
> " psraw $15, %%mm5; "
> " pand %%mm2, %%mm5; "
> " movq %%mm2, %%mm6; "
> " psraw $15, %%mm6; "
> " pand %%mm7, %%mm6; "
> " pmulhw %%mm7, %%mm2; "
> " paddw %%mm5, %%mm2; "
> " paddw %%mm6, %%mm2; "
1076,1077c1101,1102
< " movq %2, %%mm6; " /* mm6 = 2^14 */
< " pshufw $0, %%mm7, %%mm7; "
---
> " punpcklwd %%mm7, %%mm7; "
> " punpckldq %%mm7, %%mm7; "
1093,1094c1118,1120
< " movq %%mm6, %%mm3; " /* mm3 = 2^14 */
< " pshufw $0, %%mm2, %%mm2; "
---
> " movq %2, %%mm3; " /* mm3 = 2^14 */
> " punpcklwd %%mm2, %%mm2; "
> " punpckldq %%mm2, %%mm2; "
1100,1101c1126,1143
< " pmulhuw %%mm4, %%mm2; " /* mm2 = (srcpix * xcounter >> 16) */
< " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * xfrac) >> 16 */
---
> " movq %%mm4, %%mm5; " /* mm2 = (srcpix * xcounter >> 16) */
> " psraw $15, %%mm5; "
> " pand %%mm2, %%mm5; "
> " movq %%mm2, %%mm6; "
> " psraw $15, %%mm6; "
> " pand %%mm4, %%mm6; "
> " pmulhw %%mm4, %%mm2; "
> " paddw %%mm5, %%mm2; "
> " paddw %%mm6, %%mm2; "
> " movq %%mm4, %%mm5; " /* mm3 = (srcpix * xfrac) >> 16) */
> " psraw $15, %%mm5; "
> " pand %%mm3, %%mm5; "
> " movq %%mm3, %%mm6; "
> " psraw $15, %%mm6; "
> " pand %%mm4, %%mm6; "
> " pmulhw %%mm4, %%mm3; "
> " paddw %%mm5, %%mm3; "
> " paddw %%mm6, %%mm3; "
1104c1146,1154
< " pmulhuw %%mm7, %%mm2; "
---
> " movq %%mm7, %%mm5; "
> " psraw $15, %%mm5; "
> " pand %%mm2, %%mm5; "
> " movq %%mm2, %%mm6; "
> " psraw $15, %%mm6; "
> " pand %%mm7, %%mm6; "
> " pmulhw %%mm7, %%mm2; "
> " paddw %%mm5, %%mm2; "
> " paddw %%mm6, %%mm2; "
1202c1252
< long long One64 = 0x4000400040004000;
---
> long long One64 = 0x4000400040004000ULL;
1210c1260,1261
< " pshufw $0, %%mm7, %%mm7; "
---
> " punpcklwd %%mm7, %%mm7; "
> " punpckldq %%mm7, %%mm7; "
1232c1283,1284
< " pshufw $0, %%mm1, %%mm1; "
---
> " punpcklwd %%mm1, %%mm1; "
> " punpckldq %%mm1, %%mm1; "
1241,1242c1293,1310
< " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * yfrac) >> 16 */
< " pmulhuw %%mm1, %%mm4; " /* mm4 = (srcpix * ycounter >> 16) */
---
> " movq %%mm4, %%mm0; " /* mm3 = (srcpix * yfrac) >> 16) */
> " psraw $15, %%mm0; "
> " pand %%mm3, %%mm0; "
> " movq %%mm3, %%mm2; "
> " psraw $15, %%mm2; "
> " pand %%mm4, %%mm2; "
> " pmulhw %%mm4, %%mm3; "
> " paddw %%mm0, %%mm3; "
> " paddw %%mm2, %%mm3; "
> " movq %%mm1, %%mm0; " /* mm4 = (srcpix * ycounter >> 16) */
> " psraw $15, %%mm0; "
> " pand %%mm4, %%mm0; "
> " movq %%mm4, %%mm2; "
> " psraw $15, %%mm2; "
> " pand %%mm1, %%mm2; "
> " pmulhw %%mm1, %%mm4; "
> " paddw %%mm0, %%mm4; "
> " paddw %%mm2, %%mm4; "
1246c1314,1323
< " pmulhuw %%mm7, %%mm4; "
---
> " movq %%mm7, %%mm0; "
> " psraw $15, %%mm0; "
> " pand %%mm4, %%mm0; "
> " movq %%mm4, %%mm2; "
> " psraw $15, %%mm2; "
> " pand %%mm7, %%mm2; "
> " pmulhw %%mm7, %%mm4; "
> " paddw %%mm0, %%mm4; "
> " paddw %%mm2, %%mm4; "
> " pxor %%mm0, %%mm0; "
1270c1347,1348
< " pshufw $0, %%mm7, %%mm7; "
---
> " punpcklwd %%mm7, %%mm7; "
> " punpckldq %%mm7, %%mm7; "
1292c1370,1371
< " pshufw $0, %%mm1, %%mm1; "
---
> " punpcklwd %%mm1, %%mm1; "
> " punpckldq %%mm1, %%mm1; "
1301,1302c1380,1397
< " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * yfrac) >> 16 */
< " pmulhuw %%mm1, %%mm4; " /* mm4 = (srcpix * ycounter >> 16) */
---
> " movq %%mm4, %%mm0; " /* mm3 = (srcpix * yfrac) >> 16) */
> " psraw $15, %%mm0; "
> " pand %%mm3, %%mm0; "
> " movq %%mm3, %%mm2; "
> " psraw $15, %%mm2; "
> " pand %%mm4, %%mm2; "
> " pmulhw %%mm4, %%mm3; "
> " paddw %%mm0, %%mm3; "
> " paddw %%mm2, %%mm3; "
> " movq %%mm1, %%mm0; " /* mm4 = (srcpix * ycounter >> 16) */
> " psraw $15, %%mm0; "
> " pand %%mm4, %%mm0; "
> " movq %%mm4, %%mm2; "
> " psraw $15, %%mm2; "
> " pand %%mm1, %%mm2; "
> " pmulhw %%mm1, %%mm4; "
> " paddw %%mm0, %%mm4; "
> " paddw %%mm2, %%mm4; "
1306c1401,1410
< " pmulhuw %%mm7, %%mm4; "
---
> " movq %%mm7, %%mm0; "
> " psraw $15, %%mm0; "
> " pand %%mm4, %%mm0; "
> " movq %%mm4, %%mm2; "
> " psraw $15, %%mm2; "
> " pand %%mm7, %%mm2; "
> " pmulhw %%mm7, %%mm4; "
> " paddw %%mm0, %%mm4; "
> " paddw %%mm2, %%mm4; "
> " pxor %%mm0, %%mm0; "
1535,1536c1639,1642
< " pshufw $0, %%mm1, %%mm1; "
< " pshufw $0, %%mm2, %%mm2; "
---
> " punpcklwd %%mm1, %%mm1; "
> " punpckldq %%mm1, %%mm1; "
> " punpcklwd %%mm2, %%mm2; "
> " punpckldq %%mm2, %%mm2; "
1564,1565c1670,1673
< " pshufw $0, %%mm1, %%mm1; "
< " pshufw $0, %%mm2, %%mm2; "
---
> " punpcklwd %%mm1, %%mm1; "
> " punpckldq %%mm1, %%mm1; "
> " punpcklwd %%mm2, %%mm2; "
> " punpckldq %%mm2, %%mm2; "