/*********************************************

  CPE.cpp
  
  Cost Per Execution
  
  A class designed for extending.  Used for testing the actual number of
  instructions executed for a given sample of code.

  This class makes use of self-modifying code to access x86 instructions in a 
  compiler-independent manner (i.e., does not have any embedded assembler).
  This technique is based on parts of the book "Computer Systems: A Programmer's 
  Perspective" by Randal E. Bryant and David R. O'Hallaron, information on which 
  can be found at http://csapp.cs.cmu.edu/.  Sean Larsson originally adapted the 
  examples in the book and Keith Oxenrider took Sean's code, rewrote it to use
  self-modifying code and adapted it into a class.  This program should run fine
  on any x86 instruction based computer (Intel, AMD, Cyrix), but has only
  been tested on Intel.  If you get results for other machines (good or bad)
  we would very much appreciate knowing about them, particularly if you had
  to modify the program to get it to run properly.

  This code is placed in the public domain for use by anyone for anything.  We 
  ask that if you make use of this code that you acknowledge us as authors.

  Sean wrote his version 12/25/2003, Keith adapted it on 06/22/2004.

  The most recent version of this program can be found via:
    sol-biotech.com/code/

  Keith can be contacted at koxenrider[at]sol[dash]biotech[dot]com
  Sean can be contacted at infamous41md[at]sol[dash]biotech[dot]com

 *********************************************/

#include "CPE.hpp"

// Constructor: emits both machine-code routines into m_vctInstructions and
// then points GetOverhead / ReadStamp at their first bytes.
//
// NOTE(review): the routines are invoked directly out of the storage owned by
// m_vctInstructions; presumably that memory must be executable on the target
// platform for the calls to work -- confirm for systems that enforce
// no-execute data pages.
CCPE::CCPE(){
    m_ulMinExecution = 0;
    m_ulMinOverhead = 0;

    // GetOverhead is written first and therefore begins at index 0.  The
    // ReadStamp routine begins at whatever size the vector has reached once
    // GetOverhead is complete, so that size is captured before emitting it.
    writeGetOverhead();
    const int readStampOffset = static_cast<int>(m_vctInstructions.size());
    writeReadStamp();

/****************************************
    NOTE for anyone extending or modifying this class:
    Bind the function pointers only AFTER every instruction byte has been
    appended.  Appending to the vector may make it move its storage to a
    new location, and any pointer taken before that move would be left
    pointing at memory the vector no longer uses.
****************************************/

    GetOverhead = reinterpret_cast<FunctionUIntPtrInt>(&m_vctInstructions[0]);
    ReadStamp = reinterpret_cast<FunctionUIntPtr>(&m_vctInstructions[readStampOffset]);
}

/* FYI:  this is the original Visual C++ assembler
void read_stamp(unsigned int *arg) {
    unsigned int    hi = 0, lo = 0;
    _asm
    {
        read_stamp:
        cpuid
            rdtsc
            mov     [hi], edx
            mov     [lo], eax
    }
    *arg = hi;
    *(arg + 1) = lo;
}
*/
// Appends the 32-bit x86 machine code for the ReadStamp routine to
// m_vctInstructions, one byte at a time and in execution order.
//
// The emitted routine takes one stack argument (read from [ebp+8]) that is
// used as a destination pointer.  It executes cpuid followed by rdtsc, then
// stores edx at [arg] and eax at [arg+4], matching the hi/lo layout of the
// original assembler shown above.  ebx is saved and restored around the body.
void CCPE::writeReadStamp() {
    static const unsigned char readStampCode[] = {
        0x55,             // push ebp
        0x8B, 0xEC,       // mov ebp,esp
        0x83, 0xEC, 0x08, // sub esp,8
        0x33, 0xC0,       // xor eax,eax
        0x53,             // push ebx
        0x89, 0x45, 0xFC, // mov dword ptr [ebp-4],eax
        0x89, 0x45, 0xF8, // mov dword ptr [ebp-8],eax
        0x0F, 0xA2,       // cpuid
        0x0F, 0x31,       // rdtsc
        0x89, 0x55, 0xFC, // mov dword ptr [ebp-4],edx
        0x89, 0x45, 0xF8, // mov dword ptr [ebp-8],eax
        0x8B, 0x45, 0x08, // mov eax,dword ptr [ebp+8]
        0x8B, 0x4D, 0xFC, // mov ecx,dword ptr [ebp-4]
        0x8B, 0x55, 0xF8, // mov edx,dword ptr [ebp-8]
        0x89, 0x08,       // mov dword ptr [eax],ecx
        0x89, 0x50, 0x04, // mov dword ptr [eax+4],edx
        0x5B,             // pop ebx
        0x8B, 0xE5,       // mov esp,ebp
        0x5D,             // pop ebp
        0xC3              // ret
    };

    // The table holds single bytes, so sizeof() is also the element count.
    const unsigned char * const codeEnd = readStampCode + sizeof(readStampCode);
    for (const unsigned char *opcode = readStampCode; opcode != codeEnd; ++opcode) {
        m_vctInstructions.push_back(*opcode);
    }
}


/* FYI:  this is the original Visual C++ assembler
void get_overhead(unsigned int *ov, int dummy) {
    unsigned int    temp = 0, temp2 = 0;
    _asm
    {
        get_overhead:
        cpuid
        rdtsc
        mov     [temp], eax
        mov     [dummy], eax    ; //here is the dummy
        mov     [dummy], edx    ;     //DUMMY
        mov     eax, [dummy]    ;     //DUMMY
        cpuid
        rdtsc
        mov     [temp2], eax
    }
    *ov = temp2 - temp;
}
*/

// Appends the 32-bit x86 machine code for the GetOverhead routine to
// m_vctInstructions, one byte at a time and in execution order.
//
// The emitted routine takes two stack arguments: a destination pointer read
// from [ebp+8] and a scratch ("dummy") slot at [ebp+0Ch].  It runs
// cpuid/rdtsc, performs three dummy moves through the scratch slot, runs
// cpuid/rdtsc again, and stores the difference of the two eax readings
// (second minus first) through the destination pointer.  ebx is saved and
// restored around the body.
void CCPE::writeGetOverhead() {
    static const unsigned char getOverheadCode[] = {
        0x55,             // push ebp
        0x8B, 0xEC,       // mov ebp,esp
        0x83, 0xEC, 0x08, // sub esp,8
        0x33, 0xC0,       // xor eax,eax
        0x53,             // push ebx
        0x89, 0x45, 0xF8, // mov dword ptr [ebp-8],eax
        0x89, 0x45, 0xFC, // mov dword ptr [ebp-4],eax
        0x0F, 0xA2,       // cpuid
        0x0F, 0x31,       // rdtsc
        0x89, 0x45, 0xF8, // mov dword ptr [ebp-8],eax
        0x89, 0x45, 0x0C, // mov dword ptr [ebp+0Ch],eax
        0x89, 0x55, 0x0C, // mov dword ptr [ebp+0Ch],edx
        0x8B, 0x45, 0x0C, // mov eax,dword ptr [ebp+0Ch]
        0x0F, 0xA2,       // cpuid
        0x0F, 0x31,       // rdtsc
        0x89, 0x45, 0xFC, // mov dword ptr [ebp-4],eax
        0x8B, 0x45, 0xFC, // mov eax,dword ptr [ebp-4]
        0x8B, 0x4D, 0xF8, // mov ecx,dword ptr [ebp-8]
        0x2B, 0xC1,       // sub eax,ecx
        0x8B, 0x4D, 0x08, // mov ecx,dword ptr [ebp+8]
        0x5B,             // pop ebx
        0x89, 0x01,       // mov dword ptr [ecx],eax
        0x8B, 0xE5,       // mov esp,ebp
        0x5D,             // pop ebp
        0xC3              // ret
    };

    // The table holds single bytes, so sizeof() is also the element count.
    const unsigned char * const codeEnd = getOverheadCode + sizeof(getOverheadCode);
    for (const unsigned char *opcode = getOverheadCode; opcode != codeEnd; ++opcode) {
        m_vctInstructions.push_back(*opcode);
    }
}


//    Times a user-supplied function and keeps the fastest observed run.
//
//    your_func  the function to measure.  It is handed your_arg unchanged;
//               the idea is to use the void * to pass a structure with all
//               the args you need packed into it.
//    your_arg   opaque pointer forwarded to your_func on every call.
//    NRUNS      number of overhead samples taken AND number of timed calls
//               made.  With NRUNS == 0 nothing is called, both minimums
//               stay 0 and NULL is returned.
//
//    Returns the void * that your_func returned on the final iteration only,
//    so you are free to hand whatever you want back to the caller.
//
//    Side effects: m_ulMinOverhead is set to the smallest value reported by
//    GetOverhead, and m_ulMinExecution to the smallest value of
//    (elapsed ticks - m_ulMinOverhead) over all timed calls.
void * CCPE::test_it(void * (*your_func)(void *), void *your_arg, 
                     unsigned int NRUNS) {
    unsigned int i, dummy = 0;
    unsigned int oldval[2], newval[2], diff, overhead;
    void *data = NULL;

    m_ulMinOverhead = m_ulMinExecution = 0;

    //get the minimum overhead time (minimum because there are plenty of 
    //circumstances where the result can vary due to machine load)
    for (i = 0; i < NRUNS; i++) {
        GetOverhead(&overhead, dummy);
        //the first sample seeds the minimum, later ones can only lower it
        if (i == 0 || overhead < m_ulMinOverhead) m_ulMinOverhead = overhead;
    }

    //should be run at least 3 times to get code/data into L1 cache
    //if you want to strictly test the performance of the function
    for (i = 0; i < NRUNS; i++) {
        ReadStamp(oldval);//read full 64 bit timestamp

        //call function. I (Sean) noticed a tiny bit of overhead
        //with this calling scheme, not sure why exactly.
        data = (*your_func)(your_arg);

        ReadStamp(newval);//read full 64 bit new timestamp

        //ReadStamp stores the high word in [0] and the low word in [1].
        //The result is held in 32 bits, and unsigned subtraction of the low
        //words already equals the full 64-bit difference modulo 2^32 (a
        //roll-over of the low word is absorbed by the wraparound).  The high
        //word difference must NOT be added in as well: doing so skews the
        //result by one tick every time the low word rolls over mid-run.
        //NOTE(review): a run shorter than m_ulMinOverhead would wrap to a
        //very large value here -- confirm that cannot happen in practice.
        diff = (newval[1] - oldval[1]) - m_ulMinOverhead;

        //retain the minimum; the first sample seeds it
        if (i == 0 || diff < m_ulMinExecution) m_ulMinExecution = diff;
    }

//your return data gets returned on the final iteration only
    return data;
}

