[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
Re: [pygame] New strict MMX patch for transform.c
- To: pygame-users@xxxxxxxx
- Subject: Re: [pygame] New strict MMX patch for transform.c
- From: "René Dudfield" <renesd@xxxxxxxxx>
- Date: Sun, 26 Aug 2007 14:28:09 +1000
- Delivered-to: archiver@xxxxxxxx
- Delivered-to: pygame-users-outgoing@xxxxxxxx
- Delivered-to: pygame-users@xxxxxxxx
- Delivery-date: Sun, 26 Aug 2007 00:28:20 -0400
- Dkim-signature: a=rsa-sha1; c=relaxed/relaxed; d=gmail.com; s=beta; h=domainkey-signature:received:received:message-id:date:from:to:subject:in-reply-to:mime-version:content-type:content-transfer-encoding:content-disposition:references; b=OkDodKl0TK/BbbDcwsn9yZmZhwfIgUao9CjonCkcaCztCFzdOw6GtL4mYhN4eLz/tDf+hGszxefBYYysV1Hf9HRyn+00mVB8GsubcZqjKZZGz6GOwdMpbI2cZZtYqcNfiLrn6z8HFq5BX2vpzYVFtzab9ORMIl+vfVFgH8NTKQ8=
- Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=beta; h=received:message-id:date:from:to:subject:in-reply-to:mime-version:content-type:content-transfer-encoding:content-disposition:references; b=SH8rqnn2RsOeD6hlwmCZ83AafNlnGTEOG8xzk2ZTpFvXMDUn+1ym2spqW7A6ujn5xnJ/9Q03ZG2mixNGUggNuyhl1xv2LxaDT3UBZQefar/Pvuyt0u3MFFD7MyCPjglwpzZ4vWEt1w6fkpMAJ2wQFdB15kWcJfGsn/vcGTiRUFk=
- In-reply-to: <46D0A66A.3080703@xxxxxxxxx>
- References: <46D0A66A.3080703@xxxxxxxxx>
- Reply-to: pygame-users@xxxxxxxx
- Sender: owner-pygame-users@xxxxxxxx
Awesome. Thanks :)
Committed revision 1048.
On 8/26/07, Lenard Lindstrom <len-l@xxxxxxxxx> wrote:
> I am including a new patch for transform.c . It replaces the patches I
> submitted earlier. It cleans up the assembly code and omits the
> NO_SSE flag. Profiling shows that filter_shrink_X_MMX is still nearly 4X
> faster than filter_shrink_X_ONLYC. No extra tweaking, by interleaving
> instructions, was done since that showed less than a 1% performance
> change (below the margin of error for the profile).
>
> --
> Lenard Lindstrom
> <len-l@xxxxxxxxx>
>
>
> 1018c1018
> < long long One64 = 0x4000400040004000;
> ---
> > long long One64 = 0x4000400040004000ULL;
> 1025,1026c1025,1026
> < " movq %2, %%mm6; " /* mm6 = 2^14 */
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1042,1043c1042,1044
> < " movq %%mm6, %%mm3; " /* mm3 = 2^14 */
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " movq %2, %%mm3; " /* mm3 = 2^14 */
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
> 1049,1050c1050,1067
> < " pmulhuw %%mm4, %%mm2; " /* mm2 = (srcpix * xcounter >> 16) */
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * xfrac) >> 16 */
> ---
> > " movq %%mm4, %%mm5; " /* mm2 = (srcpix * xcounter >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> > " movq %%mm4, %%mm5; " /* mm3 = (srcpix * xfrac) >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm3, %%mm5; "
> > " movq %%mm3, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm5, %%mm3; "
> > " paddw %%mm6, %%mm3; "
> 1053c1070,1078
> < " pmulhuw %%mm7, %%mm2; "
> ---
> > " movq %%mm7, %%mm5; "
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm7, %%mm6; "
> > " pmulhw %%mm7, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> 1076,1077c1101,1102
> < " movq %2, %%mm6; " /* mm6 = 2^14 */
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1093,1094c1118,1120
> < " movq %%mm6, %%mm3; " /* mm3 = 2^14 */
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " movq %2, %%mm3; " /* mm3 = 2^14 */
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
> 1100,1101c1126,1143
> < " pmulhuw %%mm4, %%mm2; " /* mm2 = (srcpix * xcounter >> 16) */
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * xfrac) >> 16 */
> ---
> > " movq %%mm4, %%mm5; " /* mm2 = (srcpix * xcounter >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> > " movq %%mm4, %%mm5; " /* mm3 = (srcpix * xfrac) >> 16) */
> > " psraw $15, %%mm5; "
> > " pand %%mm3, %%mm5; "
> > " movq %%mm3, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm4, %%mm6; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm5, %%mm3; "
> > " paddw %%mm6, %%mm3; "
> 1104c1146,1154
> < " pmulhuw %%mm7, %%mm2; "
> ---
> > " movq %%mm7, %%mm5; "
> > " psraw $15, %%mm5; "
> > " pand %%mm2, %%mm5; "
> > " movq %%mm2, %%mm6; "
> > " psraw $15, %%mm6; "
> > " pand %%mm7, %%mm6; "
> > " pmulhw %%mm7, %%mm2; "
> > " paddw %%mm5, %%mm2; "
> > " paddw %%mm6, %%mm2; "
> 1202c1252
> < long long One64 = 0x4000400040004000;
> ---
> > long long One64 = 0x4000400040004000ULL;
> 1210c1260,1261
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1232c1283,1284
> < " pshufw $0, %%mm1, %%mm1; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> 1241,1242c1293,1310
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * yfrac) >> 16 */
> < " pmulhuw %%mm1, %%mm4; " /* mm4 = (srcpix * ycounter >> 16) */
> ---
> > " movq %%mm4, %%mm0; " /* mm3 = (srcpix * yfrac) >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm3, %%mm0; "
> > " movq %%mm3, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm4, %%mm2; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm0, %%mm3; "
> > " paddw %%mm2, %%mm3; "
> > " movq %%mm1, %%mm0; " /* mm4 = (srcpix * ycounter >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm1, %%mm2; "
> > " pmulhw %%mm1, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> 1246c1314,1323
> < " pmulhuw %%mm7, %%mm4; "
> ---
> > " movq %%mm7, %%mm0; "
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm7, %%mm2; "
> > " pmulhw %%mm7, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> > " pxor %%mm0, %%mm0; "
> 1270c1347,1348
> < " pshufw $0, %%mm7, %%mm7; "
> ---
> > " punpcklwd %%mm7, %%mm7; "
> > " punpckldq %%mm7, %%mm7; "
> 1292c1370,1371
> < " pshufw $0, %%mm1, %%mm1; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> 1301,1302c1380,1397
> < " pmulhuw %%mm4, %%mm3; " /* mm3 = (srcpix * yfrac) >> 16 */
> < " pmulhuw %%mm1, %%mm4; " /* mm4 = (srcpix * ycounter >> 16) */
> ---
> > " movq %%mm4, %%mm0; " /* mm3 = (srcpix * yfrac) >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm3, %%mm0; "
> > " movq %%mm3, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm4, %%mm2; "
> > " pmulhw %%mm4, %%mm3; "
> > " paddw %%mm0, %%mm3; "
> > " paddw %%mm2, %%mm3; "
> > " movq %%mm1, %%mm0; " /* mm4 = (srcpix * ycounter >> 16) */
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm1, %%mm2; "
> > " pmulhw %%mm1, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> 1306c1401,1410
> < " pmulhuw %%mm7, %%mm4; "
> ---
> > " movq %%mm7, %%mm0; "
> > " psraw $15, %%mm0; "
> > " pand %%mm4, %%mm0; "
> > " movq %%mm4, %%mm2; "
> > " psraw $15, %%mm2; "
> > " pand %%mm7, %%mm2; "
> > " pmulhw %%mm7, %%mm4; "
> > " paddw %%mm0, %%mm4; "
> > " paddw %%mm2, %%mm4; "
> > " pxor %%mm0, %%mm0; "
> 1535,1536c1639,1642
> < " pshufw $0, %%mm1, %%mm1; "
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
> 1564,1565c1670,1673
> < " pshufw $0, %%mm1, %%mm1; "
> < " pshufw $0, %%mm2, %%mm2; "
> ---
> > " punpcklwd %%mm1, %%mm1; "
> > " punpckldq %%mm1, %%mm1; "
> > " punpcklwd %%mm2, %%mm2; "
> > " punpckldq %%mm2, %%mm2; "
>
>