Re: gcc-3.1.1
Tsubai Masanari <tsubai@iri.co.jp> writes:
> What I was saying is that ... would be better written in a form like:
>
> lwz 7,0(%2)
> lwz 8,4(%2)
> lwz 9,8(%2)
> lwz 10,12(%2)
> ... repeated several times
>
> (and at the end of the loop)
> addi %2,%2,64
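Reduced to a single 16-byte group, the two loop shapes being compared look
roughly like the sketch below.  The function names are made up for
illustration, the input is assumed to be word aligned, and the code assumes
32-bit PowerPC with GCC inline asm; the register usage and constraints
mirror the patch that follows.

#include <stdint.h>

/*
 * Old shape: every lwzu both loads a word and updates the pointer,
 * so consecutive loads are serialized on the pointer register.
 */
static uint32_t
sum16_lwzu(const uint8_t *w, uint32_t sum)
{
	const uint8_t *p = w - 4;	/* pre-bias for "lwzu rX,4(p)" */
	int n = 4;			/* 4 words = 16 bytes */

	__asm __volatile(
		"addic %0,%0,0;"	/* clear carry */
		"mtctr %2;"		/* load loop count */
		"1:"
		"lwzu 7,4(%1);"		/* load word, advance pointer */
		"adde %0,%0,7;"		/* add with carry */
		"bdnz 1b;"		/* loop */
		"addze %0,%0;"		/* fold the final carry */
		: "+r"(sum), "+b"(p)
		: "r"(n)
		: "7", "ctr");
	return sum;
}

/*
 * New shape: independent lwz loads at fixed offsets, then the adds,
 * and a single addi to advance the pointer.
 */
static uint32_t
sum16_lwz(const uint8_t *w, uint32_t sum)
{
	__asm __volatile(
		"addic %0,%0,0;"	/* clear carry */
		"lwz 7,0(%1);"		/* four independent loads */
		"lwz 8,4(%1);"
		"lwz 9,8(%1);"
		"lwz 10,12(%1);"
		"adde %0,%0,7;"		/* add to sum with carry */
		"adde %0,%0,8;"
		"adde %0,%0,9;"
		"adde %0,%0,10;"
		"addi %1,%1,16;"	/* one pointer update per group */
		"addze %0,%0;"		/* fold the final carry */
		: "+r"(sum), "+b"(w)
		:
		: "7", "8", "9", "10");
	return sum;
}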
After making the change below, running in4_cksum 1,000,000 times over some
data on the Mac (500DP) I have here brought the user time down from 13.9s
to 12.4s.  However, if lwzu turns out to be faster on some CPUs, we can't
simply make this change.
enami.
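A minimal userland sketch of the measurement described above: call a checksum
routine 1,000,000 times over some data and look at the user time afterwards.
cksum_buf() is only a portable stand-in that does the same one's-complement
carry folding in C; the actual numbers were taken against in4_cksum() on
mbufs inside the kernel, so this only illustrates the methodology.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>

/* Portable stand-in: 16-bit one's-complement sum with end-around carry. */
static uint16_t
cksum_buf(const uint8_t *buf, size_t len)
{
	uint64_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint16_t)((buf[i] << 8) | buf[i + 1]);
	if (i < len)
		sum += (uint16_t)(buf[i] << 8);		/* odd trailing byte */

	while (sum >> 16)				/* fold carries (REDUCE) */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int
main(void)
{
	static uint8_t data[1500];	/* "some data": one packet-sized buffer */
	struct rusage ru;
	volatile uint16_t result;
	int i;

	memset(data, 0xa5, sizeof(data));
	for (i = 0; i < 1000000; i++)
		result = cksum_buf(data, sizeof(data));

	getrusage(RUSAGE_SELF, &ru);
	printf("user %ld.%06lds, cksum 0x%04x\n",
	    (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec, result);
	return 0;
}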
Index: in_cksum.c
===================================================================
RCS file: /cvsroot/syssrc/sys/arch/powerpc/powerpc/in_cksum.c,v
retrieving revision 1.3
diff -u -r1.3 in_cksum.c
--- in_cksum.c 2001/06/13 06:01:50 1.3
+++ in_cksum.c 2002/07/29 09:57:53
@@ -103,7 +103,7 @@
* Force to a word boundary.
*/
if ((3 & (long) w) && (mlen > 0)) {
- if ((1 & (long) w) && (mlen > 0)) {
+ if ((1 & (long) w)) {
REDUCE;
sum <<= 8;
s_util.c[0] = *w++;
@@ -111,7 +111,18 @@
byte_swapped = 1;
}
if ((2 & (long) w) && (mlen > 1)) {
- sum += *(uint16_t *)w;
+ /*
+ * Since `sum' may already hold a full 32 bit
+ * value, a plain add could lose the carry.
+ */
+ __asm __volatile(
+ "lhz 7,0(%1);" /* load current data
+ half word */
+ "addc %0,%0,7;" /* add to sum */
+ "addze %0,%0;" /* add carry bit */
+ : "+r"(sum)
+ : "b"(w)
+ : "7"); /* clobber r7 */
w += 2;
mlen -= 2;
}
@@ -119,75 +130,72 @@
if (mlen >= 64) {
register int n __asm("r0");
- uint8_t *tmpw;
n = mlen >> 6;
- tmpw = w - 4;
asm volatile(
- "addze 7,7;" /* clear carry */
- "mtctr %1;" /* load loop count */
+ "addic 0,0,0;" /* clear carry */
+ "mtctr %2;" /* load loop count */
"1:"
- "lwzu 7,4(%2);" /* load current data word */
- "lwzu 8,4(%2);"
- "lwzu 9,4(%2);"
- "lwzu 10,4(%2);"
+ "lwz 7,0(%1);" /* load current data word */
+ "lwz 8,4(%1);"
+ "lwz 9,8(%1);"
+ "lwz 10,12(%1);"
"adde %0,%0,7;" /* add to sum */
"adde %0,%0,8;"
"adde %0,%0,9;"
"adde %0,%0,10;"
- "lwzu 7,4(%2);"
- "lwzu 8,4(%2);"
- "lwzu 9,4(%2);"
- "lwzu 10,4(%2);"
+ "lwz 7,16(%1);"
+ "lwz 8,20(%1);"
+ "lwz 9,24(%1);"
+ "lwz 10,28(%1);"
"adde %0,%0,7;"
"adde %0,%0,8;"
"adde %0,%0,9;"
"adde %0,%0,10;"
- "lwzu 7,4(%2);"
- "lwzu 8,4(%2);"
- "lwzu 9,4(%2);"
- "lwzu 10,4(%2);"
+ "lwz 7,32(%1);"
+ "lwz 8,36(%1);"
+ "lwz 9,40(%1);"
+ "lwz 10,44(%1);"
"adde %0,%0,7;"
"adde %0,%0,8;"
"adde %0,%0,9;"
"adde %0,%0,10;"
- "lwzu 7,4(%2);"
- "lwzu 8,4(%2);"
- "lwzu 9,4(%2);"
- "lwzu 10,4(%2);"
+ "lwz 7,48(%1);"
+ "lwz 8,52(%1);"
+ "lwz 9,56(%1);"
+ "lwz 10,60(%1);"
"adde %0,%0,7;"
"adde %0,%0,8;"
"adde %0,%0,9;"
"adde %0,%0,10;"
+ "addi %1,%1,64;"
"bdnz 1b;" /* loop */
"addze %0,%0;" /* add carry bit */
- : "+r"(sum)
- : "r"(n), "r"(tmpw)
+ : "+r"(sum), "+b"(w)
+ : "r"(n)
: "7", "8", "9", "10"); /* clobber r7, r8, r9, r10 */
- w += n * 64;
mlen -= n * 64;
}
if (mlen >= 8) {
register int n __asm("r0");
- uint8_t *tmpw;
n = mlen >> 3;
- tmpw = w - 4;
asm volatile(
- "addze %1,%1;" /* clear carry */
- "mtctr %1;" /* load loop count */
+ "addic 0,0,0;" /* clear carry */
+ "mtctr %2;" /* load loop count */
"1:"
- "lwzu 7,4(%2);" /* load current data word */
- "lwzu 8,4(%2);"
+ "lwz 7,0(%1);" /* load current data
+ word */
+ "lwz 8,4(%1);"
"adde %0,%0,7;" /* add to sum */
"adde %0,%0,8;"
+ "addi %1,%1,8;"
"bdnz 1b;" /* loop */
"addze %0,%0;" /* add carry bit */
- : "+r"(sum)
- : "r"(n), "r"(tmpw)
+ : "+r"(sum), "+b"(w)
+ : "r"(n)
: "7", "8"); /* clobber r7, r8 */
- w += n * 8;
mlen -= n * 8;
}