[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
memmove,memset
鈴木(康)です。
mgl2 のチューニングの過程で、memmove と memset がとても遅いという
ことに気が付きました。( ちなみに memcpy は builtin 版)
memcpy       : 11179 k byte/sec
memmove_fwd  :  1751 k byte/sec
memmove_back :  1748 k byte/sec
memset       :  2627 k byte/sec
xmemset      : 17367 k byte/sec
xmemmove_fwd : 10440 k byte/sec
xmemmove_back: 10423 k byte/sec
ベンチマークプログラム (と mgl で使おうと思っている C 版)を
添付します。
--- ここから
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/time.h>
/*
 * Return the current time of day, as reported by gettimeofday(), converted
 * to whole milliseconds.
 */
static long long millitime(void) {
    struct timeval now;

    gettimeofday(&now, NULL);
    /*
     * Widen before scaling: tv_sec * 1000 evaluated in the type of tv_sec
     * overflows on platforms where that type is only 32 bits wide.
     */
    return (long long)now.tv_sec * 1000 + now.tv_usec / 1000;
}
/*
 * Fill `bytes` bytes starting at `dst` with the low 8 bits of `data`.
 *
 * The destination is first advanced byte by byte to a 4-byte boundary, then
 * filled in blocks of eight 32-bit words, then in single words, and finally
 * the remaining 0-3 bytes are written one at a time.
 *
 * Word stores go through a uint32_t pointer into the caller's buffer, as the
 * original int-based version did.
 * NOTE(review): this assumes the compiler tolerates such type-punned stores.
 *
 * Returns `dst`.
 */
void *
mgl_memset(void *dst, int data, size_t bytes) {
    unsigned char *out = dst;
    const unsigned char fill = (unsigned char)data;
    /*
     * Replicate the byte into all four lanes using unsigned arithmetic;
     * 0x01010101 * 0x80 and above does not fit in a signed 32-bit int.
     */
    const uint32_t word = UINT32_C(0x01010101) * fill;
    uint32_t *out4;
    size_t words;
    size_t blocks;

    /* Byte stores until the destination is 4-byte aligned (or we run out). */
    while (((uintptr_t)out & 0x3u) && bytes) {
        *out++ = fill;
        bytes--;
    }
    if (!bytes) {
        return dst;
    }

    out4 = (uint32_t *)out;
    words = bytes / 4;          /* whole words available            */
    blocks = words / 8;         /* whole 8-word (32-byte) blocks    */
    bytes -= words * 4;         /* trailing bytes after the words   */
    words -= blocks * 8;        /* words left over after the blocks */

    while (blocks--) {
        out4[0] = word;
        out4[1] = word;
        out4[2] = word;
        out4[3] = word;
        out4[4] = word;
        out4[5] = word;
        out4[6] = word;
        out4[7] = word;
        out4 += 8;
    }
    while (words--) {
        *out4++ = word;
    }

    out = (unsigned char *)out4;
    while (bytes--) {
        *out++ = fill;
    }
    return dst;
}
/*
 * Copy `bytes` bytes from `src` to `dst`; the two regions may overlap.
 *
 * When `dst` lies above `src` the copy runs from the end of the regions
 * towards the start, otherwise from the start towards the end, so that
 * source bytes are always read before they can be overwritten.
 *
 * In each direction the destination is first brought to a 4-byte boundary
 * with single-byte copies. If the source is then 4-byte aligned as well,
 * the bulk is moved in blocks of eight 32-bit words followed by single
 * words; anything left (or everything, when the source is misaligned
 * relative to the destination) is copied byte by byte.
 *
 * Word accesses go through uint32_t pointers into the caller's buffers, as
 * the original int-based version did.
 * NOTE(review): this assumes the compiler tolerates such type-punned access.
 *
 * Returns `dst`.
 */
void *
mgl_memmove(void *dst, const void *src, size_t bytes) {
    if ((uintptr_t)dst > (uintptr_t)src) {
        /* Destination above source: copy from the end downwards. */
        unsigned char *d = (unsigned char *)dst + bytes;
        const unsigned char *s = (const unsigned char *)src + bytes;

        /*
         * Test `bytes` before consuming it so the count can never wrap
         * around when the region ends before `d` becomes aligned.
         */
        while (bytes && ((uintptr_t)d & 0x3u)) {
            *--d = *--s;
            bytes--;
        }
        /* bytes >= 4 implies the loop above stopped on an aligned `d`. */
        if (bytes >= 4 && !((uintptr_t)s & 0x3u)) {
            uint32_t *d4 = (uint32_t *)d;
            const uint32_t *s4 = (const uint32_t *)s;
            size_t words = bytes / 4;
            size_t blocks = words / 8;

            bytes -= words * 4;
            words -= blocks * 8;
            while (blocks--) {
                d4 -= 8;
                s4 -= 8;
                /*
                 * Highest word first: with dst less than 32 bytes above
                 * src, an ascending order would overwrite source words
                 * of this block before they are read.
                 */
                d4[7] = s4[7];
                d4[6] = s4[6];
                d4[5] = s4[5];
                d4[4] = s4[4];
                d4[3] = s4[3];
                d4[2] = s4[2];
                d4[1] = s4[1];
                d4[0] = s4[0];
            }
            while (words--) {
                *--d4 = *--s4;
            }
            d = (unsigned char *)d4;
            s = (const unsigned char *)s4;
        }
        while (bytes--) {
            *--d = *--s;
        }
    } else {
        /* Destination at or below source: copy from the start upwards. */
        unsigned char *d = dst;
        const unsigned char *s = src;

        while (bytes && ((uintptr_t)d & 0x3u)) {
            *d++ = *s++;
            bytes--;
        }
        if (bytes >= 4 && !((uintptr_t)s & 0x3u)) {
            uint32_t *d4 = (uint32_t *)d;
            const uint32_t *s4 = (const uint32_t *)s;
            size_t words = bytes / 4;
            size_t blocks = words / 8;

            bytes -= words * 4;
            words -= blocks * 8;
            while (blocks--) {
                d4[0] = s4[0];
                d4[1] = s4[1];
                d4[2] = s4[2];
                d4[3] = s4[3];
                d4[4] = s4[4];
                d4[5] = s4[5];
                d4[6] = s4[6];
                d4[7] = s4[7];
                d4 += 8;
                s4 += 8;
            }
            while (words--) {
                *d4++ = *s4++;
            }
            d = (unsigned char *)d4;
            s = (const unsigned char *)s4;
        }
        while (bytes--) {
            *d++ = *s++;
        }
    }
    return dst;
}
/* Size in bytes of each benchmark buffer: 100 chunks of 1000 bytes. */
#define BENCH_BUF_BYTES (100 * 1000)

/* The two buffers the benchmark copies between and fills. */
char buf1[BENCH_BUF_BYTES];
char buf2[BENCH_BUF_BYTES];

/* Number of timed passes over the buffers for every routine measured. */
#define LOOP_COUNT 1000
/*
 * Benchmark driver.
 *
 * For each of seven routines (libc memcpy/memmove/memset and the mgl_*
 * replacements) it runs LOOP_COUNT passes over buf1/buf2 in 100 chunks of
 * 1000 bytes, measures the elapsed time with millitime(), and prints the
 * throughput in bytes per millisecond, labelled "k byte/sec".
 *
 * The command-line arguments are not used. Returns 0.
 */
int
main(int argc, char *argv[]) {
    /* One label per benchmark type, in the order the types are run. */
    static const char *const labels[] = {
        "memcpy ",
        "memmove_fwd ",
        "memmove_back ",
        "memset ",
        "xmemset ",
        "xmemmove_fwd ",
        "xmemmove_back",
    };
    enum {
        CHUNK_BYTES = 1000, /* bytes handled per library call */
        CHUNKS = 100,       /* calls per pass                 */
        TYPE_COUNT = 7      /* number of routines measured    */
    };
    int type;

    (void)argc;
    (void)argv;

    for (type = 0; type < TYPE_COUNT; type++) {
        long long start;
        long long elapsed;
        int pass;
        int chunk;

        start = millitime();
        for (pass = 0; pass < LOOP_COUNT; pass++) {
            char *b1 = buf1;
            char *b2 = buf2;

            /*
             * The routine is selected outside the chunk loop so that the
             * timed inner loop contains nothing but the call itself and
             * the pointer bumps.
             */
            switch (type) {
            case 0:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    memcpy(b1, b2, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                    b2 += CHUNK_BYTES;
                }
                break;
            case 1:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    memmove(b1, b2, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                    b2 += CHUNK_BYTES;
                }
                break;
            case 2:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    memmove(b2, b1, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                    b2 += CHUNK_BYTES;
                }
                break;
            case 3:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    memset(b1, 0, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                }
                break;
            case 4:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    mgl_memset(b1, 0, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                }
                break;
            case 5:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    mgl_memmove(b1, b2, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                    b2 += CHUNK_BYTES;
                }
                break;
            case 6:
                for (chunk = 0; chunk < CHUNKS; chunk++) {
                    mgl_memmove(b2, b1, CHUNK_BYTES);
                    b1 += CHUNK_BYTES;
                    b2 += CHUNK_BYTES;
                }
                break;
            default:
                break;
            }
        }
        elapsed = millitime() - start;

        sleep(1); /* pause between runs; the original intent was to let the cache flush */

        if (elapsed <= 0) {
            /* Too fast to measure (or the clock stepped): avoid dividing by zero. */
            printf("error\n");
        } else {
            /* Bytes per millisecond; computed in long long so a larger LOOP_COUNT cannot overflow. */
            long long rate = (long long)LOOP_COUNT * CHUNKS * CHUNK_BYTES / elapsed;

            printf("%s: %6lld k byte/sec\n", labels[type], rate);
        }
    }
    return 0;
}
--- ここまで
--
鈴木 康司 @NEC
suz@hpc.bs1.fc.nec.co.jp
TEL 0423-33-5381