/*
 heavily modified and enhanced version of memxfer5b.c(?) by (???)

  by Joerg Arndt (arndt (AT) jjj.de)


 CHANGES:
 - removed Windows specific stuff
 - let megabyte be 2^20 (was:10^6)
 - let option -p be default
 - added makefile and cleaned up the code

 TODO:
 - optionally use prefetches for long* and double*
 - comment the streaming code

gcc -Wall -O2 -ffast-math -funroll-loops -fomit-frame-pointer  -DPAGE_SIZE=131072 \
 memxferjj.cc -o memxferjj

gcc -DATHLON -Wall -O2 -ffast-math -fomit-frame-pointer -fno-exceptions \
  memxferjj.cc -o  memxferjj

for alpha:  -mcpu=ev67


for m in $(seq 0 12); do \
   nice -n -20 ./memxferjj -s 32M 20 $m ; done

for s in 1 2 4 8 16 32 64 128 256 512 1024 2048 ; do \
   nice -n -20 ./memxferjj -s ${s}k 100 6 ; done

*/

// Integer type used by the "int64 *" copy method in main().
// NOTE(review): this is a macro and it is defined before the system headers
// are included, so a header that used the identifier `int64` would see it
// replaced by `long long`.
#define int64 long long
#include <sys/time.h>
#include <sys/types.h>
#include <string.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <malloc.h>

// Shorthand used for the repetition counter in main().
typedef unsigned long ulong;
// Parse a number with optional "0"/"0x" prefix and k/M/G suffix (defined below).
size_t atoik(char *);
// malloc() wrapper that prints a message and exits when the allocation fails.
void *Malloc(size_t sz);
// Simple wall-clock timer: tstart() and tend() record two timestamps,
// tval() returns the time between them in seconds.
void tstart(void);
void tend(void);
double tval(void);

// Program name for the usage message.  Expands to argv[0], so it can only be
// used where a variable named `argv` is in scope (i.e. inside main()).
#define PROGNAME  argv[0]

// Number of bytes handled by the movntq_*_page() routines.
// Can be overridden on the compiler command line (-DPAGE_SIZE=...).
#ifndef PAGE_SIZE
# define PAGE_SIZE  4096
#endif

#ifdef  ATHLON
// Buffer allocated by alloc_fpu_context().  In the code visible in this file
// it is only referenced by the commented-out fxsave/fxrstor variants of
// fpu_save()/fpu_restore().
static void * fpu_context = 0;
// MMX/movntq based routines, defined at the end of this file.
// Note the argument order of the copy routines: source first, then
// destination (the opposite of memcpy()).
void movntq_clear(void* adr, long nbytes);
void movntq_copy(void* src, void* dst, long nbytes);
void movntq_copy_p(void* src, void* dst, long nbytes);
// Page-wise variants; they process PAGE_SIZE bytes per call.
void movntq_clear_page(void* adr);
void movntq_copy_page(void* src, void* dst);
void movntq_copy_page_p(void* src, void* dst);

// Allocate the buffer behind fpu_context: 1024 bytes with 256-byte alignment.
// The result of memalign() is stored unchecked; the active code in this file
// never dereferences it (only the commented-out fxsave/fxrstor lines would).
void alloc_fpu_context()
{
    void *ctx = memalign(256, 1024);
    fpu_context = ctx;
}

// Execute a single `femms` instruction (AMD 3DNow!).
// The commented-out alternative below would instead save the FPU state into
// fpu_context with fxsave and clear pending exceptions with fnclex.
// NOTE(review): no active code in this file calls this function.
void fpu_save()
{
    asm volatile ("\n\t femms");
//    asm volatile (
//                  "\n\t   fxsave (%0)" // save context
//                  "\n\t   fnclex"  // clear exceptions
//                  : : "r" (fpu_context) : "memory");
}

// Execute a single `femms` instruction (AMD 3DNow!); identical to fpu_save()
// as currently written.  The commented-out alternative would restore the
// state saved in fpu_context with fxrstor.
// NOTE(review): the only reference in this file is a commented-out call in
// main(); the movntq_*_page() routines do not execute femms themselves, so
// presumably their callers are expected to call this - confirm.
void fpu_restore()
{
    asm volatile ("\n\t femms");
//    asm volatile ( "\n\t   fxrstor (%0) \n" : : "r" (fpu_context) : "memory");
}

#endif // def  ATHLON


int main(int argc, char *argv[])
{
    double tottim = 0.0;

    int m = 9; // method
    int fflag = 0;  // if 0, then just Malloc once; else malloc/free each time
    int sflag = 0;  // if 1, only print averages.
    int pflag = 1;  // default is 1

    char *methods[] =
    {
        "\"memcpy\"", // 0
        "\"char *\"", // 1
        "\"short *\"", // 2
        "\"int *\"", // 3
        "\"long *\"", // 4 
        "\"long * (4x unrolled)\"", // 5
        "\"int64 *\"", // 6
        "\"double *\"", // 7
        "\"double * (4x unrolled)\"", // 8
#ifdef  ATHLON
//#warning 'FYI: ATHLON specific methods included'
        "\"streaming K7\"", // 9
        "\"streaming K7 prefetch\"", // 10
        "\"streaming K7 clear\"", // 11
        "\"long * clear\"", // 12
#endif // ATHLON
    };
    int nm = sizeof(methods)/sizeof(methods[0]);


    int ac = argc;
    char **av = argv;
#define  ARGEQ(str, var) if( !strcmp(av[1], str))  { ac--; av++; var = 1; continue; }
#define  ARG0(str, var)  if( !strcmp(av[1], str))  { ac--; av++; var = 0; continue; }
    while ( ac > 1 )
    {
        ARGEQ("-f", fflag);
        ARGEQ("-s", sflag);
        ARGEQ("-p", pflag);
        ARG0 ("+p", pflag);
//        printf("Invalid argument: %s\n", av[1]); return 1;
        break;
    }

    if ( ac < 3 )
    {
        printf("Usage: %s [-f] [-s] [+p] size cnt [method [method2...]]\n", PROGNAME);
        printf("\t-f flag says to malloc and free of the \"cnt\" times.\n");
        printf("\t-s = silent; only print averages\n");
        printf("\t+p = no prep\n");

        printf("\tmethods:\n");
        for (int i = 0; i < nm; i++)  printf("\t%2d:\t%s\n",i,methods[i]);
        return 0;
    }

    size_t size = atoik(av[1]);

    // Round size up to 4*sizeof(double) bytes.
#define DS4 (4*sizeof(double))
    if ( size != ((size/DS4)*(DS4)) )
    {
        size += (DS4);
        size /= (DS4);
        size *= (DS4);
    }

    ulong cnt = (ulong)atoik(av[2]);

    char *p1 = 0, *p2 = 0;
    if ( fflag == 0 )
    {
        p1 = (char *)Malloc(size);
        p2 = (char *)Malloc(size);
        if ( pflag )  memcpy(p1,p2,size);
    }

    if ( ac == 3 )
    {
        ac = 4;
        av[3] = "0";
    }

#ifdef  ATHLON
    alloc_fpu_context ();
#endif // def  ATHLON

    for ( ; ac > 3; ac--, av++)
    {
        if ( isdigit(*av[3]) )  m = atoi(av[3]);
        if ( (m < 0) || (m >= nm) )  { printf("invalid method !\n"); return 1; }

        if ( sflag )  tstart();

        for (ulong ui = 0; ui < cnt; ui++)
        {
            if ( !sflag )
            {
                printf("% 10ld", size);
                printf(" % 3ld", cnt);
                printf(" %-25.25s", methods[m]);
                printf("\t");
                tstart();
            }

            if ( fflag == 1 )
            {
                p1 = (char *)Malloc(size);
                p2 = (char *)Malloc(size);
            }

            switch ( m )
            {
            case 0:  // memcpy()
                {
                    (void)memcpy(p1, p2, size);
                    break;
                }

            case 1: // char
                {
                    char *dst = p1;
                    char *src = p2;
                    for (size_t j=0; j<size; j++)  *dst++ = *src++;
                    break;
                }

            case 2: // short
                {
                    short *dst = (short *)p1;
                    short *src = (short *)p2;
                    for (size_t j=0; j<size; j += sizeof(short))  *dst++ = *src++;
                    break;
                }

            case 3: // int
                {
                    int *dst = (int *)p1;
                    int *src = (int *)p2;
                    for (size_t j=0; j<size; j += sizeof(int))  *dst++ = *src++;
                    break;
                }

            case 4: // long
                {
                    long *dst = (long *)p1;
                    long *src = (long *)p2;
                    for (size_t j=0; j<size; j += sizeof(long))
                    {
//                        asm volatile ("\n\t  prefetch 320(%0)" : : "r" (src) );
                        *dst++ = *src++;
                    }

                    break;
                }

            case 5: // long unrolled
                {
                    long *dst = (long *)p1;
                    long *src = (long *)p2;
//                    asm volatile (
//                                 "\n\t  prefetch 0(%0)"
//                                 "\n\t  prefetch 64(%0)"
//                                 "\n\t  prefetch 128(%0)"
//                                 "\n\t  prefetch 192(%0)"
//                                 "\n\t  prefetch 256(%0)"
//                                 : : "r" (src) );
                    for (size_t j=0; j<size; j += 4*sizeof(long))
                    {
//                        asm volatile ("\n\t  prefetch 320(%0)" : : "r" (src) );
                        *dst++ = *src++;
                        *dst++ = *src++;
                        *dst++ = *src++;
                        *dst++ = *src++;
                    }
                    break;
                }

            case 6:  // int64
                {
                    int64 *dst = (int64 *)p1;
                    int64 *src = (int64 *)p2;
                    for (size_t j=0; j<size; j += sizeof(int64))  *dst++ = *src++;
                    break;
                }

            case 7: // double
                {
                    double *dst = (double *)p1;
                    double *src = (double *)p2;

                    for (size_t j=0; j<size; j += sizeof(double))  *dst++ = *src++;
                    // --> approx 500 MB/sec

//                    for (size_t j=0; j<size/sizeof(double); ++j)  dst[j] = src[j];
                    // --> approx 400 MB/sec

//                    unsigned i = size/sizeof(double);
//                    dst += i; while ( i-- )  *--dst = *src++;
                    // --> approx 520 MB/sec

                    break;
                }

            case 8: // double unrolled
                {
                    double *dst = (double *)p1;
                    double *src = (double *)p2;
//                    asm volatile (
//                                 "\n\t  prefetch 0(%0)"
//                                 "\n\t  prefetch 64(%0)"
//                                 "\n\t  prefetch 128(%0)"
//                                 "\n\t  prefetch 192(%0)"
//                                 "\n\t  prefetch 256(%0)"
//                                 : : "r" (src) );
                    for (size_t j=0; j<size; j += 4*sizeof(double))
                    {
//                        asm volatile ("\n\t  prefetch 320(%0)" : : "r" (src) );
                        // with prefetch 550 MB/sec, else 500 MB/sec
                        *dst++ = *src++;
                        *dst++ = *src++;
                        *dst++ = *src++;
                        *dst++ = *src++;
                    }
                    break;
                }

#ifdef  ATHLON
            case 9: // streaming
                {
                    char *dst = p1;
                    char *src = p2;
                    movntq_copy(src, dst, size);
                    break;
                }

            case 10: // streaming with prefetch
                {
                    char *dst = p1;
                    char *src = p2;
//                    for (size_t j=0; j<size; j+=PAGE_SIZE)
//                    {
//                        movntq_copy_page_p(src, dst);
//                        dst += PAGE_SIZE;
//                        src += PAGE_SIZE;
//                    }
//                    fpu_restore();

                    movntq_copy_p(src, dst, size);

                    break;
                }

            case 11: // streaming clear
                {
                    char *dst = p1;
                    movntq_clear(dst, size);
                    break;
                }

            case 12: // clear long unrolled
                {
                    long *dst = (long *)p1;
                    long src = 0xdeadbabe;
                    for (size_t j=0; j<size; j += 4*sizeof(long))
                    {
                        *dst++ = src;
                        *dst++ = src;
                        *dst++ = src;
                        *dst++ = src;
                    }
                    break;
                }


#endif // def ATHLON
            }

            if(fflag == 1)
            {
                free(p1);
                free(p2);
            }

            if(!sflag)
            {
                tend();
                double t = tval();
                tottim += t;
                if(t == 0.0)  t = .0001;
                printf(" %8.6f seconds %8.3f MB/s\n",
                       t, (double)size/t/(1.0*1024*1024));
            }
        }

        if ( sflag )
        {
            tend();
            tottim = tval();
        }

//        if ( csvflag )
//        {
//            printf("%s",methods[m]);
//            printf(",%ld",size);
//            printf(",%ld",size*cnt);
//            printf(",%8.3f",tottim);
//            printf(",%8.3f",(double)size/(tottim/cnt)/(1.0*1024*1024));
//            printf("\n");
//        }
//        else
        {
            printf("avg: % 10ld  [%2d]%-25.25s  ", size, m, methods[m]);
            printf("%8.3f MB/s\n", (double)size/(tottim/cnt)/(1.0*1024*1024));
            if ( 0==sflag )  printf("\n");
        }
        tottim = 0.0;
    }

    return 0;
}
//-------------------------------------------

#include <ctype.h>

// Convert a string to a size, similar to atoi() but with prefix and
// suffix support:
//   - a leading "0x"/"0X" selects hexadecimal, a leading "0" octal,
//     anything else decimal;
//   - after the digits, each suffix letter k/K, M or G multiplies the value
//     by 2^10, 2^20 or 2^30 respectively (letters may be repeated).
// Parsing stops at the first character that is neither a valid digit for the
// base nor one of the suffix letters; whatever was parsed so far is returned
// (0 if nothing was).  No overflow checking is done.
size_t atoik(char *s)
{
    size_t ret = 0;
    size_t base = 10;

    if (*s == '0')
    {
        base = 8;
        ++s;
        if (*s == 'x' || *s == 'X')
        {
            base = 16;
            ++s;
        }
    }

    // ctype functions require values in the range of unsigned char.
    for (; ; s++)
    {
        const unsigned char c = (unsigned char)*s;
        size_t digit;

        if (isdigit(c))  digit = (size_t)(c - '0');
        else if (base == 16 && isxdigit(c))  digit = (size_t)(toupper(c) - 'A' + 10);
        else  break;

        if (digit >= base)  break;  // e.g. '8' or '9' in an octal number

        ret = base*ret + digit;
    }

    for (; isalpha((unsigned char)*s); s++)
    {
        switch ( toupper((unsigned char)*s) )
        {
        case 'K': ret *= (size_t)1024; break;
        case 'M': ret *= (size_t)1024*1024; break;
        case 'G': ret *= (size_t)1024*1024*1024; break;
        default:
            return ret;
        }
    }

    return ret;
}
//-------------------------------------------

// malloc() wrapper for callers that cannot continue without the memory:
// returns the allocated block, or prints a message and terminates the
// program with exit status 1 when malloc() returns NULL.
void *Malloc(size_t sz)
{
    void *p = malloc(sz);
    if (p == NULL)
    {
        // %zu is the conversion that matches size_t.
        printf("malloc(%zu) failed\n", sz);
        exit(1);
    }
    return p;
}
//-------------------------------------------

// Timestamps recorded by tstart() and tend() and evaluated by tval().
// (Names without a leading underscore: such identifiers are reserved at
// file scope.)
static struct timeval timer_start, timer_end;

// Record the start of the interval to be measured.
void tstart(void)
{
    gettimeofday(&timer_start, NULL);
}
//-------------------------------------------

// Record the end of the interval to be measured.
void tend(void)
{
    gettimeofday(&timer_end, NULL);
}
//-------------------------------------------

// Return the time between the last tstart() and the last tend() in seconds.
// Seconds and microseconds are subtracted separately before they are
// combined, so the small difference is not rounded away against the large
// absolute seconds-since-epoch value.
double tval(void)
{
    const double dsec  = (double)timer_end.tv_sec  - (double)timer_start.tv_sec;
    const double dusec = (double)timer_end.tv_usec - (double)timer_start.tv_usec;
    return dsec + dusec * 1e-6;
}
//-------------------------------------------


#ifdef  ATHLON

// Zero-fill nbytes bytes starting at dst.
// mm0 is cleared once with pxor and then stored with movntq in chunks of
// NBYTES (= 8*NCHUNKS) bytes per loop iteration; the remaining tail
// (fewer than NBYTES bytes) is cleared with ordinary byte stores.
// The routine ends with sfence followed by femms.
// NOTE(review): nbytes is a signed long and a negative value would make the
// tail loop run with negative indices - assumes nbytes >= 0.
// NOTE(review): the asm statements do not list mm0 as clobbered.
void
movntq_clear(void* dst, long nbytes)
{
    char *d = (char *)dst;
    // The "i" (0) operand is not referenced by the template; presumably it is
    // only there so that the statement has operands and %% is written as %.
    asm volatile ( "\n\t pxor %%mm0, %%mm0" :  : "i" (0) );

    // Disabled alternative: fill with a 64-bit pattern instead of zero.
//    unsigned long long u = 0xabadcafedeadbabeULL;
//    asm volatile ( "\n\t   movq (%0), %%mm0" : : "r" (&u) );

    // Number of 8-byte stores per loop iteration.  NCHUNKS is #undef'd at the
    // end of the function, NBYTES is not (later functions re-#define it with
    // the identical replacement text).
#define  NCHUNKS  2  // 1, 2, 3 or 4
#define  NBYTES  (8*NCHUNKS)
    while ( nbytes>=NBYTES )
    {
        nbytes -= NBYTES;

        asm volatile (
                      "\n\t   movntq %%mm0, (%0)"
#if  NCHUNKS > 1
                      "\n\t   movntq %%mm0, 8(%0)"
#if  NCHUNKS > 2
                      "\n\t   movntq %%mm0, 16(%0)"
#if  NCHUNKS > 3
                      "\n\t   movntq %%mm0, 24(%0)"
#endif
#endif
#endif
//                      "\n\t   movntq %%mm0, 32(%0)"
//                      "\n\t   movntq %%mm0, 40(%0)"
//                      "\n\t   movntq %%mm0, 48(%0)"
//                      "\n\t   movntq %%mm0, 56(%0)"
                      : : "r" (d) : "memory");
        d += NBYTES;
    }

    // Tail: clear the remaining bytes, highest index first.
    while ( nbytes-- )  { d[nbytes] = 0; }

    asm volatile ("\n\t sfence");
    asm volatile ("\n\t femms");
#undef  NCHUNKS
}
//-------------------------------------------


// Copy nbytes bytes from src to dst (note: source first, unlike memcpy()).
// Each loop iteration moves NBYTES (= 8*NCHUNKS) bytes: every 8-byte chunk is
// loaded into an MMX register with movq and stored with movntq.  The
// remaining tail (fewer than NBYTES bytes) is copied bytewise.
// The routine ends with sfence followed by femms.
// NOTE(review): nbytes is a signed long; assumes nbytes >= 0 (a negative
// value would make the tail loop run with negative indices).
// NOTE(review): the asm statement does not list the mm registers it
// overwrites as clobbered.
void
movntq_copy(void* src, void* dst, long nbytes)
{
    char *s = (char *)src,  *d = (char *)dst;

    // Number of 8-byte chunks per loop iteration.  NCHUNKS is #undef'd at the
    // end of the function, NBYTES is not.
#define  NCHUNKS  2  // 1, 2, 3 or 4
#define  NBYTES  (8*NCHUNKS)
    while ( nbytes>=NBYTES )
    {
        nbytes -= NBYTES;
        asm volatile (
                      "\n\t   movq (%0),%%mm0"
                      "\n\t   movntq %%mm0,(%1)"
#if NCHUNKS > 1
                      "\n\t   movq 8(%0),%%mm1"
                      "\n\t   movntq %%mm1,8(%1)"
#if NCHUNKS > 2
                      "\n\t   movq 16(%0),%%mm2"
                      "\n\t   movntq %%mm2,16(%1)"
#if NCHUNKS > 3
                      "\n\t   movq 24(%0),%%mm3"
                      "\n\t   movntq %%mm3,24(%1)"
#endif
#endif
#endif
                      :
                      : "r" (s), "r" (d)
                      : "memory" );

        s += NBYTES;
        d += NBYTES;
    }

    // Tail: copy the remaining bytes, highest index first.
    while ( nbytes-- )  { d[nbytes] = s[nbytes]; }

    asm volatile ("\n\t sfence");
    asm volatile ("\n\t femms");
#undef  NCHUNKS
}
//-------------------------------------------

// Like movntq_copy(), with software prefetching of the source:
// copy nbytes bytes from src to dst (source first, unlike memcpy()).
// Before the loop, prefetch is issued for offsets 0, 64, 128, 192 and 256 of
// the source; every loop iteration then prefetches 320 bytes ahead of the
// current source position and moves NBYTES (= 8*NCHUNKS) bytes with
// movq/movntq pairs.  The remaining tail is copied bytewise, and the routine
// ends with sfence followed by femms.
// With NCHUNKS == 4 the source pointer advances 32 bytes per iteration, so
// consecutive iterations issue prefetches for addresses 32 bytes apart.
// NCHUNKS and NBYTES are not #undef'd at the end of this function.
// NOTE(review): nbytes is a signed long; assumes nbytes >= 0.
// NOTE(review): the asm statements do not list the mm registers they
// overwrite as clobbered.
void
movntq_copy_p(void* src, void* dst, long nbytes)
{
    char *s = (char *)src,  *d = (char *)dst;

#define  NCHUNKS  4  // 4(default) or 8
#define  NBYTES  (8*NCHUNKS)
    asm volatile (
                  "\n\t  prefetch 0(%0)"
                  "\n\t  prefetch 64(%0)"
                  "\n\t  prefetch 128(%0)"
                  "\n\t  prefetch 192(%0)"
                  "\n\t  prefetch 256(%0)"
                  : : "r" (s) );

    while ( nbytes>=NBYTES )
    {
        asm volatile ("\n\t  prefetch 320(%0)" : : "r" (s) );

        nbytes -= NBYTES;
        asm volatile (
                      "\n\t   movq (%0),%%mm0"
                      "\n\t   movntq %%mm0,(%1)"

                      "\n\t   movq 8(%0),%%mm1"
                      "\n\t   movntq %%mm1,8(%1)"

                      "\n\t   movq 16(%0),%%mm2"
                      "\n\t   movntq %%mm2,16(%1)"

                      "\n\t   movq 24(%0),%%mm3"
                      "\n\t   movntq %%mm3,24(%1)"
#if NCHUNKS > 4
                      "\n\t   movq 32(%0),%%mm0"
                      "\n\t   movntq %%mm0,32(%1)"

                      "\n\t   movq 40(%0),%%mm1"
                      "\n\t   movntq %%mm1,40(%1)"

                      "\n\t   movq 48(%0),%%mm2"
                      "\n\t   movntq %%mm2,48(%1)"

                      "\n\t   movq 56(%0),%%mm3"
                      "\n\t   movntq %%mm3,56(%1)"
#endif
                      :
                      : "r" (s), "r" (d)
                      : "memory" );

        s += NBYTES;
        d += NBYTES;
    }

    // Tail: copy the remaining bytes, highest index first.
    while ( nbytes-- )  { d[nbytes] = s[nbytes]; }

    asm volatile ("\n\t sfence");
    asm volatile ("\n\t femms");
}
//-------------------------------------------


// Copy one 64-byte block from src to dst: eight 8-byte chunks, each loaded
// with movq into its own MMX register (mm0..mm7) and stored with movntq.
// No sfence or femms is executed here; the page routines that call this
// helper issue sfence once after their loops.
// NOTE(review): the asm statement does not list mm0..mm7 as clobbered.
static inline void
move_64(void *src, void *dst)
{
    asm volatile (
                  "\n\t   movq (%0),%%mm0"
                  "\n\t   movntq %%mm0,(%1)"
                  "\n\t   movq 8(%0),%%mm1"
                  "\n\t   movntq %%mm1,8(%1)"
                  "\n\t   movq 16(%0),%%mm2"
                  "\n\t   movntq %%mm2,16(%1)"
                  "\n\t   movq 24(%0),%%mm3"
                  "\n\t   movntq %%mm3,24(%1)"
                  "\n\t   movq 32(%0),%%mm4"
                  "\n\t   movntq %%mm4,32(%1)"
                  "\n\t   movq 40(%0),%%mm5"
                  "\n\t   movntq %%mm5,40(%1)"
                  "\n\t   movq 48(%0),%%mm6"
                  "\n\t   movntq %%mm6,48(%1)"
                  "\n\t   movq 56(%0),%%mm7"
                  "\n\t   movntq %%mm7,56(%1)"
                  :
                  : "r" (src), "r" (dst)
                  : "memory" );
}
//-------------------------------------------

// Copy PAGE_SIZE bytes from src to dst (source first) in 64-byte blocks via
// move_64(), then issue sfence.  femms is not executed here.
void
movntq_copy_page(void* src, void* dst)
{
    // Byte pointers are needed for the pointer arithmetic below
    // (arithmetic on void* is not standard).
    char *from = (char *)src;
    char *to = (char *)dst;

    int nblk = 0;
    while ( nblk < PAGE_SIZE/64 )
    {
        move_64(from, to);
        from += 64;
        to += 64;
        ++nblk;
    }

    asm volatile ("\n\t   sfence" : : );
}
//-------------------------------------------


// Copy PAGE_SIZE bytes from src to dst (source first) in 64-byte blocks via
// move_64(), prefetching the source ahead of the copy, then issue sfence.
// femms is not executed here.
void
movntq_copy_page_p(void* src, void* dst)
{
    // Byte pointers are needed for the pointer arithmetic below
    // (arithmetic on void* is not standard).
    char *from = (char *)src;
    char *to = (char *)dst;

    // Prime the prefetcher with the first five 64-byte lines.
    asm volatile (
                  "\n\t   prefetch (%0)"
                  "\n\t   prefetch 64(%0)"
                  "\n\t   prefetch 128(%0)"
                  "\n\t   prefetch 192(%0)"
                  "\n\t   prefetch 256(%0)"
                  : : "r" (from) );

    // Main part: all but the last four blocks, each preceded by a prefetch
    // 320 bytes ahead of the block being copied.
    // prefetch distance = 320-64 = 256 = 4*64
    int nblk = 0;
    while ( nblk < PAGE_SIZE/64-4 )
    {
        asm volatile ( "\n\t   prefetch 320(%0)" : : "r" (from) );

        move_64(from, to);
        from += 64;
        to += 64;
        ++nblk;
    }

    // Last four blocks: copied without issuing further prefetches.
    int tail = 0;
    while ( tail < 4 )
    {
        move_64(from, to);
        from += 64;
        to += 64;
        ++tail;
    }

    asm volatile ("\n\t   sfence" : : );
}
//-------------------------------------------


// Zero-fill PAGE_SIZE bytes starting at adr: mm0 is cleared once with pxor
// and then written with eight movntq stores per 64-byte block; sfence is
// issued at the end.  femms is not executed here.
void movntq_clear_page(void* adr)
{
    char *out = (char *)adr;

    asm volatile ( "\n\t   pxor %%mm0, %%mm0" :  : "i" (0) );

    // Disabled alternative: fill with a 64-bit pattern instead of zero.
//    unsigned long long u = 0xabadcafedeadbabeULL;
//    asm volatile ( "\n\t   movq (%0), %%mm0" : : "r" (&u) );

    int nblk = 0;
    while ( nblk < PAGE_SIZE/64 )
    {
        asm volatile (
                      "\n\t   movntq %%mm0, (%0)"
                      "\n\t   movntq %%mm0, 8(%0)"
                      "\n\t   movntq %%mm0, 16(%0)"
                      "\n\t   movntq %%mm0, 24(%0)"
                      "\n\t   movntq %%mm0, 32(%0)"
                      "\n\t   movntq %%mm0, 40(%0)"
                      "\n\t   movntq %%mm0, 48(%0)"
                      "\n\t   movntq %%mm0, 56(%0)"
                      : : "r" (out) : "memory");
        out += 64;
        ++nblk;
    }

    asm volatile ("\n\t   sfence");
}
//-------------------------------------------

#endif // def  ATHLON

