Trying to be more flexible in register use so the compiler can do reloads

* app/composite/gimp-composite-sse2.c:
Trying to be more flexible in register use so that the compiler can do
its own reloads without running out of registers at optimisation levels
other than -O2.

This avoids the gcc diagnostic "error: can't find a register in class
`GENERAL_REGS' while reloading `asm'".
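
For context, a minimal and entirely hypothetical sketch (not code from this commit) of why that diagnostic appears: i386 has very few general registers, and an asm() that demands several register operands while also clobbering eax/ecx/edx can leave gcc's reload pass with nothing to work with, especially at -O0/-O1 where operands live in memory and need temporary registers. Loosening constraints to "m" where the instruction accepts a memory operand, and clobbering only what the asm really touches, gives the compiler room to reload.

/* Hypothetical illustration only; the function names and operands are
 * invented and do not appear in the composite code.
 *
 * Register-hungry form: "r" operands plus a blanket clobber of
 * eax/ecx/edx can exhaust GENERAL_REGS on i386, typically at -O0/-O1:
 *   error: can't find a register in class `GENERAL_REGS' while reloading `asm'
 */
static inline unsigned int
add3_register_hungry (unsigned int a, unsigned int b, unsigned int c)
{
  asm ("addl %1,%0\n"
       "\taddl %2,%0\n"
       : "+r" (a)
       : "r" (b), "r" (c)
       : "%eax", "%ecx", "%edx");   /* clobbers most of what is left */
  return a;
}

/* Reload-friendly form: memory operands where the instruction allows
 * them and no gratuitous clobbers, so gcc can spill and reload freely. */
static inline unsigned int
add3_reload_friendly (unsigned int a, unsigned int b, unsigned int c)
{
  asm ("addl %1,%0\n"
       "\taddl %2,%0\n"
       : "+r" (a)
       : "m" (b), "m" (c));
  return a;
}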

* app/composite/gimp-composite-x86.h
Use more newlines in asm() macros to ensure that gcc gets the
instruction count correct.  This is partially complete as of this
commit.
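
To make the x86.h change concrete: the macros in that header expand into fragments of the asm() templates used by the composite loops, and this commit switches their instruction separators from "; " to explicit "\n" so that, per the rationale above, gcc's estimate of how many instructions an asm() contains stays accurate. Below is a small sketch using the reworked mmx_low_bytes_to_words macro from the diff further down; the wrapper function and its names are invented for illustration.

/* mmx_low_bytes_to_words as rewritten by this commit: one instruction
 * per "\n"-terminated line (previously the first line ended in "; "). */
#define mmx_low_bytes_to_words(src,dst,zero) \
        "\tmovq      %%"#src", %%"#dst"\n" \
        "\tpunpcklbw %%"#zero", %%"#dst"\n"

/* Hypothetical usage sketch, not a function from the composite code:
 * widen the low four bytes of an 8-byte pixel pair to 16-bit words.
 * (Real callers, like the composite loops, issue emms afterwards.) */
static inline void
low_bytes_example (const unsigned long long *a, unsigned long long *d)
{
  asm volatile ("movq %1, %%mm2\n"
                "\tpxor %%mm6, %%mm6\n"              /* mm6 = zero      */
                mmx_low_bytes_to_words (mm2, mm4, mm6)
                "\tmovq %%mm4, %0\n"                 /* store the words */
                : "=m" (*d)
                : "m" (*a)
                : "%mm2", "%mm4", "%mm6");
}
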
Helvetix Victorinox 2005-05-17 17:24:26 +00:00
parent faa0a60cb1
commit f41b1f24f6
5 changed files with 98 additions and 286 deletions

View file

@@ -1,3 +1,19 @@
+2005-05-17  Helvetix Victorinox  <helvetix@gimp.org>
+
+* app/composite/gimp-composite-sse2.c:
+Trying to be more flexible in register use so the compiler can do
+reloads without running out of registers when using optimisation
+modes other than 2.
+
+Avoid the message "error: can't find a register in class
+`GENERAL_REGS' while reloading `asm'"
+
+* app/composite/gimp-composite-x86.h
+Use more newlines in asm() macros to ensure that gcc gets the
+instruction count correct.  This is partially complete as of this
+commit.
+
2005-05-17  Sven Neumann  <sven@gimp.org>
* configure.in: bumped version number to 2.3.1.

View file

@@ -271,7 +271,7 @@ gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm2, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm2", "%mm3", "%mm4");
+: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -318,7 +318,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -341,7 +341,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@@ -591,7 +591,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -622,7 +622,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@@ -669,7 +669,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -699,7 +699,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@@ -1048,7 +1048,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm7");
+: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
a++;
d++;
}
@@ -1066,7 +1066,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
-: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm7");
+: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
}
asm("emms");
@@ -1267,8 +1267,8 @@ gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
asm volatile (" movq %0,%%mm2\n"
"\tmovq %1,%%mm3\n"
-"\tmovq %%mm3,%0\n"
-"\tmovq %%mm2,%1\n"
+"\tmovntq %%mm3,%0\n"
+"\tmovntq %%mm2,%1\n"
: "+m" (*a), "+m" (*b)
:
: "%mm2", "%mm3");

View file

@@ -292,10 +292,10 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpminub %%mm3, %%mm2\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
-"\tmovq %%mm1, %0\n"
+"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4");
+: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
@@ -318,206 +318,12 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
-: "%mm1", "%mm2", "%mm3", "%mm4");
+: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
#if 0
void
xxxgimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
asm volatile (" movq %0, %%mm0\n"
"\tmovq %1, %%mm7\n"
:
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
: "%mm0", "%mm7");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
{
asm volatile (" movq %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpcklbw %%mm5,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm6,%%mm6\n"
"\tpunpckhbw %%mm6,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
"\tpminub %%mm0,%%mm1\n"
"\tmovq %3,%%mm3\n"
"\tmovq %%mm3,%%mm2\n"
"\tpandn %%mm5,%%mm3\n"
"\tpand %%mm2,%%mm1\n"
"\tpor %%mm1,%%mm3\n"
"\tmovq %%mm3,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
op.A += 8;
op.B += 8;
op.D += 8;
}
if (op.n_pixels)
{
asm volatile (" movd %1,%%mm0\n"
"\tmovd %2,%%mm1\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpcklbw %%mm5,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm6,%%mm6\n"
"\tpunpckhbw %%mm6,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
"\tpminub %%mm0,%%mm1\n"
"\tmovq %3,%%mm3\n"
"\tmovq %%mm3,%%mm2\n"
"\tpandn %%mm5,%%mm3\n"
"\tpand %%mm2,%%mm1\n"
"\tpor %%mm1,%%mm3\n"
"\tmovd %%mm3,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
#endif
#if 0
void
xxxgimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
for (; op.n_pixels >= 2; op.n_pixels -= 2)
{
asm volatile (" movq %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm2,%%mm3\n"
"\tpunpcklbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm5) "\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm2,%%mm3\n"
"\tpunpckhbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm6) "\n"
"\tpackuswb %%mm6,%%mm5\n"
"\tmovq %4,%%mm6\n"
"\tmovq %%mm1,%%mm7\n"
"\t" pminub(mm0,mm7,mm2) "\n"
"\tpand %%mm6,%%mm7\n"
"\tpandn %%mm5,%%mm6\n"
"\tpor %%mm6,%%mm7\n"
"\tmovq %%mm7,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
op.A += 8;
op.B += 8;
op.D += 8;
}
if (op.n_pixels)
{
asm volatile (" movd %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm2,%%mm3\n"
"\tpunpcklbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm5) "\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm2,%%mm3\n"
"\tpunpckhbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm6) "\n"
"\tpackuswb %%mm6,%%mm5\n"
"\tmovq %4,%%mm6\n"
"\tmovq %%mm1,%%mm7\n"
"\tpminub %%mm0,%%mm7\n"
"\tpand %%mm6,%%mm7\n"
"\tpandn %%mm5,%%mm6\n"
"\tpor %%mm6,%%mm7\n"
"\tmovd %%mm7,%2\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
#endif
void
gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
@@ -562,7 +368,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
-"\tmovq %%mm1,%0\n"
+"\tmovntq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
@@ -641,7 +447,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\tpor %%mm2, %%mm1\n"
-"\tmovq %%mm1, %0\n"
+"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
@@ -702,7 +508,7 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpminub %%mm2, %%mm3\n"
"\tpand %%mm0, %%mm3\n"
"\tpor %%mm3, %%mm1\n"
-"\tmovq %%mm1, %0\n"
+"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
@@ -765,7 +571,7 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
-"\tmovq %%mm1, %0\n"
+"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
@@ -1009,7 +815,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpackuswb %%mm4,%%mm1\n"
-"\tmovq %%mm1,%0\n"
+"\tmovntq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
: "%mm1", "%mm2", "%mm4", "%mm5", "%mm7");
@@ -1044,9 +850,13 @@ gimp_composite_screen_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
-asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
-asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7");
-asm volatile ("pxor %mm6, %mm6");
+asm volatile ("pxor %%mm6,%%mm6\n"
+"movq %0,%%mm0\n"
+"movq %1,%%mm7\n"
+: /* empty */
+: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
+: "%mm0", "%mm6", "%mm7");
for (; n_pixels >= 2; n_pixels -= 2)
{

View file

@@ -605,46 +605,32 @@ gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op)
GimpCompositeContext op = *_op;
/*
-* Inhale one whole i686 cache line at once. 64 bytes, 16 rgba8
-* pixels, 4 128 bit xmm registers.
+* Inhale one whole i686 cache line at once. 128 bytes == 32 rgba8
+* pixels == 8 128 bit xmm registers.
*/
for (; op.n_pixels >= 16; op.n_pixels -= 16)
{
-asm volatile (" movdqu %0,%%xmm0\n"
-"\tmovdqu %1,%%xmm1\n"
-"\tmovdqu %2,%%xmm2\n"
-"\tmovdqu %3,%%xmm3\n"
-"\tmovdqu %4,%%xmm4\n"
-"\tmovdqu %5,%%xmm5\n"
-"\tmovdqu %6,%%xmm6\n"
-"\tmovdqu %7,%%xmm7\n"
-:
-: "m" (op.A[0]), "m" (op.B[0]),
-"m" (op.A[1]), "m" (op.B[1]),
-"m" (op.A[2]), "m" (op.B[2]),
-"m" (op.A[3]), "m" (op.B[3])
-: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
-);
+asm volatile (" movdqu %0,%%xmm0\n" : :"m" (op.A[0]) : "%xmm0");
+asm volatile (" movdqu %0,%%xmm1\n" : :"m" (op.B[0]) : "%xmm1");
+asm volatile (" movdqu %0,%%xmm2\n" : :"m" (op.A[1]) : "%xmm2");
+asm volatile (" movdqu %0,%%xmm3\n" : :"m" (op.B[1]) : "%xmm3");
+asm volatile (" movdqu %0,%%xmm4\n" : :"m" (op.A[2]) : "%xmm4");
+asm volatile (" movdqu %0,%%xmm5\n" : :"m" (op.B[2]) : "%xmm5");
+asm volatile (" movdqu %0,%%xmm6\n" : :"m" (op.A[3]) : "%xmm6");
+asm volatile (" movdqu %0,%%xmm7\n" : :"m" (op.B[3]) : "%xmm7");
-asm volatile ("\tmovdqu %%xmm0,%1\n"
-"\tmovdqu %%xmm1,%0\n"
-"\tmovdqu %%xmm2,%3\n"
-"\tmovdqu %%xmm3,%2\n"
-"\tmovdqu %%xmm4,%5\n"
-"\tmovdqu %%xmm5,%4\n"
-"\tmovdqu %%xmm6,%7\n"
-"\tmovdqu %%xmm7,%6\n"
-: "=m" (op.A[0]), "=m" (op.B[0]),
-"=m" (op.A[1]), "=m" (op.B[1]),
-"=m" (op.A[2]), "=m" (op.B[2]),
-"=m" (op.A[3]), "=m" (op.B[3])
-: /* empty */
-);
+asm volatile ("\tmovdqu %%xmm0,%0\n" : "=m" (op.A[0]));
+asm volatile ("\tmovdqu %%xmm1,%0\n" : "=m" (op.B[0]));
+asm volatile ("\tmovdqu %%xmm2,%0\n" : "=m" (op.A[1]));
+asm volatile ("\tmovdqu %%xmm3,%0\n" : "=m" (op.B[1]));
+asm volatile ("\tmovdqu %%xmm4,%0\n" : "=m" (op.A[2]));
+asm volatile ("\tmovdqu %%xmm5,%0\n" : "=m" (op.B[2]));
+asm volatile ("\tmovdqu %%xmm6,%0\n" : "=m" (op.A[3]));
+asm volatile ("\tmovdqu %%xmm7,%0\n" : "=m" (op.B[3]));
op.A += 64;
op.B += 64;
}
for (; op.n_pixels >= 4; op.n_pixels -= 4)
{
asm volatile (" movdqu %0,%%xmm2\n"

View file

@@ -25,14 +25,14 @@
* Convert the low 8bit byte of the src to 16bit words in dst.
*/
#define mmx_low_bytes_to_words(src,dst,zero) \
-"\tmovq %%"#src", %%"#dst"; " \
+"\tmovq %%"#src", %%"#dst"\n" \
"\tpunpcklbw %%"#zero", %%"#dst"\n"
/*
* Convert the high 8bit byte of the src to 16bit words in dst.
*/
#define mmx_high_bytes_to_words(src,dst,zero) \
-"\tmovq %%"#src", %%"#dst"; " \
+"\tmovq %%"#src", %%"#dst"\n" \
"\tpunpckhbw %%"#zero", %%"#dst"\n"
#define xmm_low_bytes_to_words(src,dst,zero) \
@@ -65,18 +65,18 @@
* (high-order bit of each word is cleared)
* Clobbers eax, ecx edx
*/
-#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
-"movd %%" #divisor ",%%ecx; " \
-"xorl %%edx,%%edx; " \
-"divw %%cx; " \
-"roll $16, %%eax; " \
-"roll $16, %%ecx; " \
-"xorl %%edx,%%edx; " \
-"divw %%cx; " \
-"btr $15, %%eax; " \
-"roll $16, %%eax; " \
-"btr $15, %%eax; " \
-"movd %%eax,%%" #quotient ";"
+#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax\n" \
+"movd %%" #divisor ",%%ecx\n" \
+"xorl %%edx,%%edx\n" \
+"divw %%cx\n" \
+"roll $16, %%eax\n" \
+"roll $16, %%ecx\n" \
+"xorl %%edx,%%edx\n" \
+"divw %%cx\n" \
+"btr $15, %%eax\n" \
+"roll $16, %%eax\n" \
+"btr $15, %%eax\n" \
+"movd %%eax,%%" #quotient "\n"