netcpu_looper.c - platform/external/netperf - Git at Google

 char   netcpu_looper_id[]="\
 @(#)netcpu_looper.c (c) Copyright 2005-2007. Version 2.4.3";

 /* netcpu_looper.c

    Implement the soaker process specific portions of netperf CPU
    utilization measurements. These are broken-out into a separate file
    to make life much nicer over in netlib.c which had become a maze of
    twisty, CPU-util-related, #ifdefs, all different.  raj 2005-01-26
    */

 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif

 #include <stdio.h>

 #ifdef HAVE_FCNTL_H
 # include <fcntl.h>
 #endif
 #if HAVE_UNISTD_H
 # include <unistd.h>
 #endif
 #if defined(HAVE_MMAP) || defined(HAVE_SYS_MMAN_H)
 # include <sys/mman.h>
 #else
 # error netcpu_looper requires mmap
 #endif

 #if TIME_WITH_SYS_TIME
 # include <sys/time.h>
 # include <time.h>
 #else
 # if HAVE_SYS_TIME_H
 #  include <sys/time.h>
 # else
 #  include <time.h>
 # endif
 #endif

 #if HAVE_SYS_TYPES_H
 # include <sys/types.h>
 #endif

 #if HAVE_SYS_WAIT_H
 # include <sys/wait.h>
 #endif

 #ifdef HAVE_SIGNAL_H
 #include <signal.h>
 #endif

 #ifdef HAVE_ERRNO_H
 #include <errno.h>
 #endif

 #include "netsh.h"
 #include "netlib.h"

 #define PAGES_PER_CHILD 2

 /* the lib_start_count and lib_end_count arrays hold the starting
    and ending values of whatever is counting when the system is
    idle. The rate at which this increments during a test is compared
    with a previous calibrarion to arrive at a CPU utilization
    percentage. raj 2005-01-26 */
 static uint64_t  lib_start_count[MAXCPUS];
 static uint64_t  lib_end_count[MAXCPUS];

 static int *cpu_mappings;

 static int lib_idle_fd;
 static uint64_t *lib_idle_address[MAXCPUS];
 static long     *lib_base_pointer;
 static pid_t     lib_idle_pids[MAXCPUS];
 static int       lib_loopers_running=0;

 /* we used to use this code to bind the loopers, but since we have
    decided to enable processor affinity for the actual
    netperf/netserver processes we will use that affinity routine,
    which happens to know about more systems than this */

 #ifdef NOTDEF
 static void
 bind_to_processor(int child_num)
 {
   /* This routine will bind the calling process to a particular */
   /* processor. We are not choosy as to which processor, so it will be */
   /* the process id mod the number of processors - shifted by one for */
   /* those systems which name processor starting from one instead of */
   /* zero. on those systems where I do not yet know how to bind a */
   /* process to a processor, this routine will be a no-op raj 10/95 */

   /* just as a reminder, this is *only* for the looper processes, not */
   /* the actual measurement processes. those will, should, MUST float */
   /* or not float from CPU to CPU as controlled by the operating */
   /* system defaults. raj 12/95 */

 #ifdef __hpux
 #include <sys/syscall.h>
 #include <sys/mp.h>

   int old_cpu = -2;

   if (debug) {
     fprintf(where,
             "child %d asking for CPU %d as pid %d with %d CPUs\n",
             child_num,
             (child_num % lib_num_loc_cpus),
             getpid(),
             lib_num_loc_cpus);
     fflush(where);
   }

   SETPROCESS((child_num % lib_num_loc_cpus), getpid());
   return;

 #else
 #if defined(__sun) && defined(__SVR4)
  /* should only be Solaris */
 #include <sys/processor.h>
 #include <sys/procset.h>

   int old_binding;

   if (debug) {
     fprintf(where,
             "bind_to_processor: child %d asking for CPU %d as pid %d with %d CPUs\n",
             child_num,
             (child_num % lib_num_loc_cpus),
             getpid(),
             lib_num_loc_cpus);
     fflush(where);
   }

   if (processor_bind(P_PID,
                      getpid(),
                      (child_num % lib_num_loc_cpus),
                       &old_binding) != 0) {
     fprintf(where,"bind_to_processor: unable to perform processor binding\n");
     fprintf(where,"                   errno %d\n",errno);
     fflush(where);
   }
   return;
 #else
 #ifdef WIN32

   if (!SetThreadAffinityMask(GetCurrentThread(), (ULONG_PTR)1 << (child_num % lib_num_loc_cpus))) {
     perror("SetThreadAffinityMask failed");
     fflush(stderr);
   }

   if (debug) {
     fprintf(where,
             "bind_to_processor: child %d asking for CPU %d of %d CPUs\n",
             child_num,
             (child_num % lib_num_loc_cpus),
             lib_num_loc_cpus);
     fflush(where);
   }

 #endif
   return;
 #endif /* __sun && _SVR4 */
 #endif /* __hpux */
 }
 #endif

  /* sit_and_spin will just spin about incrementing a value */
  /* this value will either be in a memory mapped region on Unix shared */
  /* by each looper process, or something appropriate on Windows/NT */
  /* (malloc'd or such). This routine is reasonably ugly in that it has */
  /* priority manipulating code for lots of different operating */
  /* systems. This routine never returns. raj 1/96 */

 static void
 sit_and_spin(int child_index)

 {
   uint64_t *my_counter_ptr;

  /* only use C stuff if we are not WIN32 unless and until we */
  /* switch from CreateThread to _beginthread. raj 1/96 */
 #ifndef WIN32
   /* we are the child. we could decide to exec some separate */
   /* program, but that doesn't really seem worthwhile - raj 4/95 */
   if (debug > 1) {
     fprintf(where,
             "Looper child %d is born, pid %d\n",
             child_index,
             getpid());
     fflush(where);
   }

 #endif /* WIN32 */

   /* reset our base pointer to be at the appropriate offset */
   my_counter_ptr = (uint64_t *) ((char *)lib_base_pointer +
                              (netlib_get_page_size() *
                               PAGES_PER_CHILD * child_index));

   /* in the event we are running on an MP system, it would */
   /* probably be good to bind the soaker processes to specific */
   /* processors. I *think* this is the most reasonable thing to */
   /* do, and would be closes to simulating the information we get */
   /* on HP-UX with pstat. I could put all the system-specific code */
   /* here, but will "abstract it into another routine to keep this */
   /* area more readable. I'll probably do the same thine with the */
   /* "low pri code" raj 10/95 */

   /* since we are "flying blind" wrt where we should bind the looper
      processes, we want to use the cpu_map that was prepared by netlib
      rather than assume that the CPU ids on the system start at zero
      and are contiguous. raj 2006-04-03 */
   bind_to_specific_processor(child_index % lib_num_loc_cpus,1);

   for (*my_counter_ptr = 0L;
        ;
        (*my_counter_ptr)++) {
     if (!(*lib_base_pointer % 1)) {
       /* every once and again, make sure that our process priority is */
       /* nice and low. also, by making system calls, it may be easier */
       /* for us to be pre-empted by something that needs to do useful */
       /* work - like the thread of execution actually sending and */
       /* receiving data across the network :) */
 #ifdef _AIX
       int pid,prio;

       prio = PRIORITY;
       pid = getpid();
       /* if you are not root, this call will return EPERM - why one */
       /* cannot change one's own priority to  lower value is beyond */
       /* me. raj 2/26/96 */
       setpri(pid, prio);
 #else /* _AIX */
 #ifdef __sgi
       int pid,prio;

       prio = PRIORITY;
       pid = getpid();
       schedctl(NDPRI, pid, prio);
       sginap(0);
 #else /* __sgi */
 #ifdef WIN32
       SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_IDLE);
 #else /* WIN32 */
 #if defined(__sun) && defined(__SVR4)
 #include <sys/types.h>
 #include <sys/priocntl.h>
 #include <sys/rtpriocntl.h>
 #include <sys/tspriocntl.h>
       /* I would *really* like to know how to use priocntl to make the */
       /* priority low for this looper process. however, either my mind */
       /* is addled, or the manpage in section two for priocntl is not */
       /* terribly helpful - for one, it has no examples :( so, if you */
       /* can help, I'd love to hear from you. in the meantime, we will */
       /* rely on nice(39). raj 2/26/96 */
       nice(39);
 #else /* __sun && __SVR4 */
       nice(39);
 #endif /* __sun && _SVR4 */
 #endif /* WIN32 */
 #endif /* __sgi */
 #endif /* _AIX */
     }
   }
 }


  /* this routine will start all the looper processes or threads for */
  /* measuring CPU utilization. */

 static void
 start_looper_processes()
 {

   unsigned int      i, file_size;

   /* we want at least two pages for each processor. the */
   /* child for any one processor will write to the first of his two */
   /* pages, and the second page will be a buffer in case there is page */
   /* prefetching. if your system pre-fetches more than a single page, */
   /* well, you'll have to modify this or live with it :( raj 4/95 */

   file_size = ((netlib_get_page_size() * PAGES_PER_CHILD) *
                lib_num_loc_cpus);

 #ifndef WIN32

   /* we we are not using WINDOWS NT (or 95 actually :), then we want */
   /* to create a memory mapped region so we can see all the counting */
   /* rates of the loopers */

   /* could we just use an anonymous memory region for this? it is */
   /* possible that using a mmap()'ed "real" file, while convenient for */
   /* debugging, could result in some filesystem activity - like */
   /* metadata updates? raj 4/96 */
   lib_idle_fd = open("/tmp/netperf_cpu",O_RDWR | O_CREAT | O_EXCL);

   if (lib_idle_fd == -1) {
     fprintf(where,"create_looper: file creation; errno %d\n",errno);
     fflush(where);
     exit(1);
   }

   if (chmod("/tmp/netperf_cpu",0644) == -1) {
     fprintf(where,"create_looper: chmod; errno %d\n",errno);
     fflush(where);
     exit(1);
   }

   /* with the file descriptor in place, lets be sure that the file is */
   /* large enough. */

   if (truncate("/tmp/netperf_cpu",file_size) == -1) {
     fprintf(where,"create_looper: truncate: errno %d\n",errno);
     fflush(where);
     exit(1);
   }

   /* the file should be large enough now, so we can mmap it */

   /* if the system does not have MAP_VARIABLE, just define it to */
   /* be zero. it is only used/needed on HP-UX (?) raj 4/95 */
 #ifndef MAP_VARIABLE
 #define MAP_VARIABLE 0x0000
 #endif /* MAP_VARIABLE */
 #ifndef MAP_FILE
 #define MAP_FILE 0x0000
 #endif /* MAP_FILE */
   if ((lib_base_pointer = (long *)mmap(NULL,
                                        file_size,
                                        PROT_READ | PROT_WRITE,
                                        MAP_FILE | MAP_SHARED | MAP_VARIABLE,
                                        lib_idle_fd,
                                        0)) == (long *)-1) {
     fprintf(where,"create_looper: mmap: errno %d\n",errno);
     fflush(where);
     exit(1);
   }


   if (debug > 1) {
     fprintf(where,"num CPUs %d, file_size %d, lib_base_pointer %p\n",
             lib_num_loc_cpus,
             file_size,
             lib_base_pointer);
     fflush(where);
   }

   /* we should have a valid base pointer. lets fork */

   for (i = 0; i < (unsigned int)lib_num_loc_cpus; i++) {
     switch (lib_idle_pids[i] = fork()) {
     case -1:
       perror("netperf: fork");
       exit(1);
     case 0:
       /* we are the child. we could decide to exec some separate */
       /* program, but that doesn't really seem worthwhile - raj 4/95 */

       signal(SIGTERM, SIG_DFL);
       sit_and_spin(i);

       /* we should never really get here, but if we do, just exit(0) */
       exit(0);
       break;
     default:
       /* we must be the parent */
       lib_idle_address[i] = (uint64_t *) ((char *)lib_base_pointer +
                                       (netlib_get_page_size() *
                                        PAGES_PER_CHILD * i));
       if (debug) {
         fprintf(where,"lib_idle_address[%d] is %p\n",
                 i,
                 lib_idle_address[i]);
         fflush(where);
       }
     }
   }
 #else
   /* we are compiled -DWIN32 */
   if ((lib_base_pointer = malloc(file_size)) == NULL) {
     fprintf(where,
             "create_looper_process could not malloc %d bytes\n",
             file_size);
     fflush(where);
     exit(1);
   }

   /* now, create all the threads */
   for(i = 0; i < (unsigned int)lib_num_loc_cpus; i++) {
     long place_holder;
     if ((lib_idle_pids[i] = CreateThread(0,
                                          0,
                                          (LPTHREAD_START_ROUTINE)sit_and_spin,
                                          (LPVOID)(ULONG_PTR)i,
                                          0,
                                          &place_holder)) == NULL ) {
       fprintf(where,
               "create_looper_process: CreateThread failed\n");
       fflush(where);
       /* I wonder if I need to look for other threads to kill? */
       exit(1);
     }
     lib_idle_address[i] = (long *) ((char *)lib_base_pointer +
                                     (netlib_get_page_size() *
                                      PAGES_PER_CHILD * i));
     if (debug) {
       fprintf(where,"lib_idle_address[%d] is %p\n",
               i,
               lib_idle_address[i]);
       fflush(where);
     }
   }
 #endif /* WIN32 */

   /* we need to have the looper processes settled-in before we do */
   /* anything with them, so lets sleep for say 30 seconds. raj 4/95 */

   sleep(30);
 }

 void
 cpu_util_init(void)
 {
   cpu_method = LOOPER;

   /* we want to get the looper processes going */
   if (!lib_loopers_running) {
     start_looper_processes();
     lib_loopers_running = 1;
   }

   return;
 }

 /* clean-up any left-over CPU util resources - looper processes,
    files, whatever.  raj 2005-01-26 */
 void
 cpu_util_terminate() {

 #ifdef WIN32
   /* it would seem that if/when the process exits, all the threads */
   /* will go away too, so I don't think I need any explicit thread */
   /* killing calls here. raj 1/96 */
 #else

   int i;

   /* now go through and kill-off all the child processes */
   for (i = 0; i < lib_num_loc_cpus; i++){
     /* SIGKILL can leave core files behind - thanks to Steinar Haug */
     /* for pointing that out. */
     kill(lib_idle_pids[i],SIGTERM);
   }
   lib_loopers_running = 0;
   /* reap the children */
   while(waitpid(-1, NULL, WNOHANG) > 0) { }

   /* finally, unlink the mmaped file */
   munmap((caddr_t)lib_base_pointer,
          ((netlib_get_page_size() * PAGES_PER_CHILD) *
           lib_num_loc_cpus));
   unlink("/tmp/netperf_cpu");
 #endif
   return;
 }

 int
 get_cpu_method(void)
 {
   return LOOPER;
 }

  /* calibrate_looper */

  /* Loop a number of iterations, sleeping interval seconds each and */
  /* count how high the idle counter gets each time. Return  the */
  /* measured cpu rate to the calling routine. raj 4/95 */

 float
 calibrate_idle_rate (int iterations, int interval)
 {

   uint64_t
     firstcnt[MAXCPUS],
     secondcnt[MAXCPUS];

   float
     elapsed,
     temp_rate,
     rate[MAXTIMES],
     local_maxrate;

   long
     sec,
     usec;

   int
     i,
     j;

   struct  timeval time1, time2 ;
   struct  timezone tz;

   if (iterations > MAXTIMES) {
     iterations = MAXTIMES;
   }

   local_maxrate = (float)-1.0;

   for(i = 0; i < iterations; i++) {
     rate[i] = (float)0.0;
     for (j = 0; j < lib_num_loc_cpus; j++) {
       firstcnt[j] = *(lib_idle_address[j]);
     }
     gettimeofday (&time1, &tz);
     sleep(interval);
     gettimeofday (&time2, &tz);

     if (time2.tv_usec < time1.tv_usec)
       {
         time2.tv_usec += 1000000;
         time2.tv_sec -=1;
       }
     sec = time2.tv_sec - time1.tv_sec;
     usec = time2.tv_usec - time1.tv_usec;
     elapsed = (float)sec + ((float)usec/(float)1000000.0);

     if(debug) {
       fprintf(where, "Calibration for counter run: %d\n",i);
       fprintf(where,"\tsec = %ld usec = %ld\n",sec,usec);
       fprintf(where,"\telapsed time = %g\n",elapsed);
     }

     for (j = 0; j < lib_num_loc_cpus; j++) {
       secondcnt[j] = *(lib_idle_address[j]);
       if(debug) {
         /* I know that there are situations where compilers know about */
         /* long long, but the library fucntions do not... raj 4/95 */
         fprintf(where,
                 "\tfirstcnt[%d] = 0x%8.8lx%8.8lx secondcnt[%d] = 0x%8.8lx%8.8lx\n",
                 j,
                 (uint32_t)(firstcnt[j]>>32),
                 (uint32_t)(firstcnt[j]&0xffffffff),
                 j,
                 (uint32_t)(secondcnt[j]>>32),
                 (uint32_t)(secondcnt[j]&0xffffffff));
       }
       /* we assume that it would wrap no more than once. we also */
       /* assume that the result of subtracting will "fit" raj 4/95 */
       temp_rate = (secondcnt[j] >= firstcnt[j]) ?
         (float)(secondcnt[j] - firstcnt[j])/elapsed :
           (float)(secondcnt[j]-firstcnt[j]+MAXLONG)/elapsed;
       if (temp_rate > rate[i]) rate[i] = temp_rate;
       if(debug) {
         fprintf(where,"\trate[%d] = %g\n",i,rate[i]);
         fflush(where);
       }
       if (local_maxrate < rate[i]) local_maxrate = rate[i];
     }
   }
   if(debug) {
     fprintf(where,"\tlocal maxrate = %g per sec. \n",local_maxrate);
     fflush(where);
   }
   return local_maxrate;
 }


 void
 get_cpu_idle (uint64_t *res)
 {
   int i;

   for (i = 0; i < lib_num_loc_cpus; i++){
     res[i] = *lib_idle_address[i];
   }

 }

 float
 calc_cpu_util_internal(float elapsed_time)
 {
   int i;
   float correction_factor;
   float actual_rate;

   lib_local_cpu_util = (float)0.0;
   /* It is possible that the library measured a time other than */
   /* the one that the user want for the cpu utilization */
   /* calculations - for example, tests that were ended by */
   /* watchdog timers such as the udp stream test. We let these */
   /* tests tell up what the elapsed time should be. */

   if (elapsed_time != 0.0) {
     correction_factor = (float) 1.0 +
       ((lib_elapsed - elapsed_time) / elapsed_time);
   }
   else {
     correction_factor = (float) 1.0;
   }

   for (i = 0; i < lib_num_loc_cpus; i++) {

     /* it would appear that on some systems, in loopback, nice is
      *very* effective, causing the looper process to stop dead in its
      tracks. if this happens, we need to ensure that the calculation
      does not go south. raj 6/95 and if we run completely out of idle,
      the same thing could in theory happen to the USE_KSTAT path. raj
      8/2000 */

     if (lib_end_count[i] == lib_start_count[i]) {
       lib_end_count[i]++;
     }

     actual_rate = (lib_end_count[i] > lib_start_count[i]) ?
       (float)(lib_end_count[i] - lib_start_count[i])/lib_elapsed :
       (float)(lib_end_count[i] - lib_start_count[i] +
 	      MAXLONG)/ lib_elapsed;
     if (debug) {
       fprintf(where,
               "calc_cpu_util: actual_rate on processor %d is %f start 0x%8.8lx%8.8lx end 0x%8.8lx%8.8lx\n",
               i,
               actual_rate,
               (uint32_t)(lib_start_count[i]>>32),
               (uint32_t)(lib_start_count[i]&0xffffffff),
               (uint32_t)(lib_end_count[i]>>32),
               (uint32_t)(lib_end_count[i]&0xffffffff));
     }
     lib_local_per_cpu_util[i] = (lib_local_maxrate - actual_rate) /
       lib_local_maxrate * 100;
     lib_local_cpu_util += lib_local_per_cpu_util[i];
   }
   /* we want the average across all n processors */
   lib_local_cpu_util /= (float)lib_num_loc_cpus;

   lib_local_cpu_util *= correction_factor;
   return lib_local_cpu_util;


 }
 void
 cpu_start_internal(void)
 {
   get_cpu_idle(lib_start_count);
   return;
 }

 void
 cpu_stop_internal(void)
 {
   get_cpu_idle(lib_end_count);
 }
	char netcpu_looper_id[]="\
	@(#)netcpu_looper.c (c) Copyright 2005-2007. Version 2.4.3";

	/* netcpu_looper.c

	Implement the soaker process specific portions of netperf CPU
	utilization measurements. These are broken-out into a separate file
	to make life much nicer over in netlib.c which had become a maze of
	twisty, CPU-util-related, #ifdefs, all different. raj 2005-01-26
	*/

	#ifdef HAVE_CONFIG_H
	#include <config.h>
	#endif

	#include <stdio.h>

	#ifdef HAVE_FCNTL_H
	# include <fcntl.h>
	#endif
	#if HAVE_UNISTD_H
	# include <unistd.h>
	#endif
	#if defined(HAVE_MMAP) \|\| defined(HAVE_SYS_MMAN_H)
	# include <sys/mman.h>
	#else
	# error netcpu_looper requires mmap
	#endif

	#if TIME_WITH_SYS_TIME
	# include <sys/time.h>
	# include <time.h>
	#else
	# if HAVE_SYS_TIME_H
	# include <sys/time.h>
	# else
	# include <time.h>
	# endif
	#endif

	#if HAVE_SYS_TYPES_H
	# include <sys/types.h>
	#endif

	#if HAVE_SYS_WAIT_H
	# include <sys/wait.h>
	#endif

	#ifdef HAVE_SIGNAL_H
	#include <signal.h>
	#endif

	#ifdef HAVE_ERRNO_H
	#include <errno.h>
	#endif

	#include "netsh.h"
	#include "netlib.h"

	#define PAGES_PER_CHILD 2

	/* the lib_start_count and lib_end_count arrays hold the starting
	and ending values of whatever is counting when the system is
	idle. The rate at which this increments during a test is compared
	with a previous calibrarion to arrive at a CPU utilization
	percentage. raj 2005-01-26 */
	static uint64_t lib_start_count[MAXCPUS];
	static uint64_t lib_end_count[MAXCPUS];

	static int *cpu_mappings;

	static int lib_idle_fd;
	static uint64_t *lib_idle_address[MAXCPUS];
	static long *lib_base_pointer;
	static pid_t lib_idle_pids[MAXCPUS];
	static int lib_loopers_running=0;

	/* we used to use this code to bind the loopers, but since we have
	decided to enable processor affinity for the actual
	netperf/netserver processes we will use that affinity routine,
	which happens to know about more systems than this */

	#ifdef NOTDEF
	static void
	bind_to_processor(int child_num)
	{
	/* This routine will bind the calling process to a particular */
	/* processor. We are not choosy as to which processor, so it will be */
	/* the process id mod the number of processors - shifted by one for */
	/* those systems which name processor starting from one instead of */
	/* zero. on those systems where I do not yet know how to bind a */
	/* process to a processor, this routine will be a no-op raj 10/95 */

	/* just as a reminder, this is only for the looper processes, not */
	/* the actual measurement processes. those will, should, MUST float */
	/* or not float from CPU to CPU as controlled by the operating */
	/* system defaults. raj 12/95 */

	#ifdef __hpux
	#include <sys/syscall.h>
	#include <sys/mp.h>

	int old_cpu = -2;

	if (debug) {
	fprintf(where,
	"child %d asking for CPU %d as pid %d with %d CPUs\n",
	child_num,
	(child_num % lib_num_loc_cpus),
	getpid(),
	lib_num_loc_cpus);
	fflush(where);
	}

	SETPROCESS((child_num % lib_num_loc_cpus), getpid());
	return;

	#else
	#if defined(__sun) && defined(__SVR4)
	/* should only be Solaris */
	#include <sys/processor.h>
	#include <sys/procset.h>

	int old_binding;

	if (debug) {
	fprintf(where,
	"bind_to_processor: child %d asking for CPU %d as pid %d with %d CPUs\n",
	child_num,
	(child_num % lib_num_loc_cpus),
	getpid(),
	lib_num_loc_cpus);
	fflush(where);
	}

	if (processor_bind(P_PID,
	getpid(),
	(child_num % lib_num_loc_cpus),
	&old_binding) != 0) {
	fprintf(where,"bind_to_processor: unable to perform processor binding\n");
	fprintf(where," errno %d\n",errno);
	fflush(where);
	}
	return;
	#else
	#ifdef WIN32

	if (!SetThreadAffinityMask(GetCurrentThread(), (ULONG_PTR)1 << (child_num % lib_num_loc_cpus))) {
	perror("SetThreadAffinityMask failed");
	fflush(stderr);
	}

	if (debug) {
	fprintf(where,
	"bind_to_processor: child %d asking for CPU %d of %d CPUs\n",
	child_num,
	(child_num % lib_num_loc_cpus),
	lib_num_loc_cpus);
	fflush(where);
	}

	#endif
	return;
	#endif /* __sun && _SVR4 */
	#endif /* __hpux */
	}
	#endif

	/* sit_and_spin will just spin about incrementing a value */
	/* this value will either be in a memory mapped region on Unix shared */
	/* by each looper process, or something appropriate on Windows/NT */
	/* (malloc'd or such). This routine is reasonably ugly in that it has */
	/* priority manipulating code for lots of different operating */
	/* systems. This routine never returns. raj 1/96 */

	static void
	sit_and_spin(int child_index)

	{
	uint64_t *my_counter_ptr;

	/* only use C stuff if we are not WIN32 unless and until we */
	/* switch from CreateThread to _beginthread. raj 1/96 */
	#ifndef WIN32
	/* we are the child. we could decide to exec some separate */
	/* program, but that doesn't really seem worthwhile - raj 4/95 */
	if (debug > 1) {
	fprintf(where,
	"Looper child %d is born, pid %d\n",
	child_index,
	getpid());
	fflush(where);
	}

	#endif /* WIN32 */

	/* reset our base pointer to be at the appropriate offset */
	my_counter_ptr = (uint64_t ) ((char )lib_base_pointer +
	(netlib_get_page_size() *
	PAGES_PER_CHILD * child_index));

	/* in the event we are running on an MP system, it would */
	/* probably be good to bind the soaker processes to specific */
	/* processors. I think this is the most reasonable thing to */
	/* do, and would be closes to simulating the information we get */
	/* on HP-UX with pstat. I could put all the system-specific code */
	/* here, but will "abstract it into another routine to keep this */
	/* area more readable. I'll probably do the same thine with the */
	/* "low pri code" raj 10/95 */

	/* since we are "flying blind" wrt where we should bind the looper
	processes, we want to use the cpu_map that was prepared by netlib
	rather than assume that the CPU ids on the system start at zero
	and are contiguous. raj 2006-04-03 */
	bind_to_specific_processor(child_index % lib_num_loc_cpus,1);

	for (*my_counter_ptr = 0L;
	;
	(*my_counter_ptr)++) {
	if (!(*lib_base_pointer % 1)) {
	/* every once and again, make sure that our process priority is */
	/* nice and low. also, by making system calls, it may be easier */
	/* for us to be pre-empted by something that needs to do useful */
	/* work - like the thread of execution actually sending and */
	/* receiving data across the network :) */
	#ifdef _AIX
	int pid,prio;

	prio = PRIORITY;
	pid = getpid();
	/* if you are not root, this call will return EPERM - why one */
	/* cannot change one's own priority to lower value is beyond */
	/* me. raj 2/26/96 */
	setpri(pid, prio);
	#else /* _AIX */
	#ifdef __sgi
	int pid,prio;

	prio = PRIORITY;
	pid = getpid();
	schedctl(NDPRI, pid, prio);
	sginap(0);
	#else /* __sgi */
	#ifdef WIN32
	SetThreadPriority(GetCurrentThread(),THREAD_PRIORITY_IDLE);
	#else /* WIN32 */
	#if defined(__sun) && defined(__SVR4)
	#include <sys/types.h>
	#include <sys/priocntl.h>
	#include <sys/rtpriocntl.h>
	#include <sys/tspriocntl.h>
	/* I would really like to know how to use priocntl to make the */
	/* priority low for this looper process. however, either my mind */
	/* is addled, or the manpage in section two for priocntl is not */
	/* terribly helpful - for one, it has no examples :( so, if you */
	/* can help, I'd love to hear from you. in the meantime, we will */
	/* rely on nice(39). raj 2/26/96 */
	nice(39);
	#else /* __sun && __SVR4 */
	nice(39);
	#endif /* __sun && _SVR4 */
	#endif /* WIN32 */
	#endif /* __sgi */
	#endif /* _AIX */
	}
	}
	}



	/* this routine will start all the looper processes or threads for */
	/* measuring CPU utilization. */

	static void
	start_looper_processes()
	{

	unsigned int i, file_size;

	/* we want at least two pages for each processor. the */
	/* child for any one processor will write to the first of his two */
	/* pages, and the second page will be a buffer in case there is page */
	/* prefetching. if your system pre-fetches more than a single page, */
	/* well, you'll have to modify this or live with it :( raj 4/95 */

	file_size = ((netlib_get_page_size() * PAGES_PER_CHILD) *
	lib_num_loc_cpus);

	#ifndef WIN32

	/* we we are not using WINDOWS NT (or 95 actually :), then we want */
	/* to create a memory mapped region so we can see all the counting */
	/* rates of the loopers */

	/* could we just use an anonymous memory region for this? it is */
	/* possible that using a mmap()'ed "real" file, while convenient for */
	/* debugging, could result in some filesystem activity - like */
	/* metadata updates? raj 4/96 */
	lib_idle_fd = open("/tmp/netperf_cpu",O_RDWR \| O_CREAT \| O_EXCL);

	if (lib_idle_fd == -1) {
	fprintf(where,"create_looper: file creation; errno %d\n",errno);
	fflush(where);
	exit(1);
	}

	if (chmod("/tmp/netperf_cpu",0644) == -1) {
	fprintf(where,"create_looper: chmod; errno %d\n",errno);
	fflush(where);
	exit(1);
	}

	/* with the file descriptor in place, lets be sure that the file is */
	/* large enough. */

	if (truncate("/tmp/netperf_cpu",file_size) == -1) {
	fprintf(where,"create_looper: truncate: errno %d\n",errno);
	fflush(where);
	exit(1);
	}

	/* the file should be large enough now, so we can mmap it */

	/* if the system does not have MAP_VARIABLE, just define it to */
	/* be zero. it is only used/needed on HP-UX (?) raj 4/95 */
	#ifndef MAP_VARIABLE
	#define MAP_VARIABLE 0x0000
	#endif /* MAP_VARIABLE */
	#ifndef MAP_FILE
	#define MAP_FILE 0x0000
	#endif /* MAP_FILE */
	if ((lib_base_pointer = (long *)mmap(NULL,
	file_size,
	PROT_READ \| PROT_WRITE,
	MAP_FILE \| MAP_SHARED \| MAP_VARIABLE,
	lib_idle_fd,
	0)) == (long *)-1) {
	fprintf(where,"create_looper: mmap: errno %d\n",errno);
	fflush(where);
	exit(1);
	}


	if (debug > 1) {
	fprintf(where,"num CPUs %d, file_size %d, lib_base_pointer %p\n",
	lib_num_loc_cpus,
	file_size,
	lib_base_pointer);
	fflush(where);
	}

	/* we should have a valid base pointer. lets fork */

	for (i = 0; i < (unsigned int)lib_num_loc_cpus; i++) {
	switch (lib_idle_pids[i] = fork()) {
	case -1:
	perror("netperf: fork");
	exit(1);
	case 0:
	/* we are the child. we could decide to exec some separate */
	/* program, but that doesn't really seem worthwhile - raj 4/95 */

	signal(SIGTERM, SIG_DFL);
	sit_and_spin(i);

	/* we should never really get here, but if we do, just exit(0) */
	exit(0);
	break;
	default:
	/* we must be the parent */
	lib_idle_address[i] = (uint64_t ) ((char )lib_base_pointer +
	(netlib_get_page_size() *
	PAGES_PER_CHILD * i));
	if (debug) {
	fprintf(where,"lib_idle_address[%d] is %p\n",
	i,
	lib_idle_address[i]);
	fflush(where);
	}
	}
	}
	#else
	/* we are compiled -DWIN32 */
	if ((lib_base_pointer = malloc(file_size)) == NULL) {
	fprintf(where,
	"create_looper_process could not malloc %d bytes\n",
	file_size);
	fflush(where);
	exit(1);
	}

	/* now, create all the threads */
	for(i = 0; i < (unsigned int)lib_num_loc_cpus; i++) {
	long place_holder;
	if ((lib_idle_pids[i] = CreateThread(0,
	0,
	(LPTHREAD_START_ROUTINE)sit_and_spin,
	(LPVOID)(ULONG_PTR)i,
	0,
	&place_holder)) == NULL ) {
	fprintf(where,
	"create_looper_process: CreateThread failed\n");
	fflush(where);
	/* I wonder if I need to look for other threads to kill? */
	exit(1);
	}
	lib_idle_address[i] = (long ) ((char )lib_base_pointer +
	(netlib_get_page_size() *
	PAGES_PER_CHILD * i));
	if (debug) {
	fprintf(where,"lib_idle_address[%d] is %p\n",
	i,
	lib_idle_address[i]);
	fflush(where);
	}
	}
	#endif /* WIN32 */

	/* we need to have the looper processes settled-in before we do */
	/* anything with them, so lets sleep for say 30 seconds. raj 4/95 */

	sleep(30);
	}

	void
	cpu_util_init(void)
	{
	cpu_method = LOOPER;

	/* we want to get the looper processes going */
	if (!lib_loopers_running) {
	start_looper_processes();
	lib_loopers_running = 1;
	}

	return;
	}

	/* clean-up any left-over CPU util resources - looper processes,
	files, whatever. raj 2005-01-26 */
	void
	cpu_util_terminate() {

	#ifdef WIN32
	/* it would seem that if/when the process exits, all the threads */
	/* will go away too, so I don't think I need any explicit thread */
	/* killing calls here. raj 1/96 */
	#else

	int i;

	/* now go through and kill-off all the child processes */
	for (i = 0; i < lib_num_loc_cpus; i++){
	/* SIGKILL can leave core files behind - thanks to Steinar Haug */
	/* for pointing that out. */
	kill(lib_idle_pids[i],SIGTERM);
	}
	lib_loopers_running = 0;
	/* reap the children */
	while(waitpid(-1, NULL, WNOHANG) > 0) { }

	/* finally, unlink the mmaped file */
	munmap((caddr_t)lib_base_pointer,
	((netlib_get_page_size() * PAGES_PER_CHILD) *
	lib_num_loc_cpus));
	unlink("/tmp/netperf_cpu");
	#endif
	return;
	}

	int
	get_cpu_method(void)
	{
	return LOOPER;
	}

	/* calibrate_looper */

	/* Loop a number of iterations, sleeping interval seconds each and */
	/* count how high the idle counter gets each time. Return the */
	/* measured cpu rate to the calling routine. raj 4/95 */

	float
	calibrate_idle_rate (int iterations, int interval)
	{

	uint64_t
	firstcnt[MAXCPUS],
	secondcnt[MAXCPUS];

	float
	elapsed,
	temp_rate,
	rate[MAXTIMES],
	local_maxrate;

	long
	sec,
	usec;

	int
	i,
	j;

	struct timeval time1, time2 ;
	struct timezone tz;

	if (iterations > MAXTIMES) {
	iterations = MAXTIMES;
	}

	local_maxrate = (float)-1.0;

	for(i = 0; i < iterations; i++) {
	rate[i] = (float)0.0;
	for (j = 0; j < lib_num_loc_cpus; j++) {
	firstcnt[j] = *(lib_idle_address[j]);
	}
	gettimeofday (&time1, &tz);
	sleep(interval);
	gettimeofday (&time2, &tz);

	if (time2.tv_usec < time1.tv_usec)
	{
	time2.tv_usec += 1000000;
	time2.tv_sec -=1;
	}
	sec = time2.tv_sec - time1.tv_sec;
	usec = time2.tv_usec - time1.tv_usec;
	elapsed = (float)sec + ((float)usec/(float)1000000.0);

	if(debug) {
	fprintf(where, "Calibration for counter run: %d\n",i);
	fprintf(where,"\tsec = %ld usec = %ld\n",sec,usec);
	fprintf(where,"\telapsed time = %g\n",elapsed);
	}

	for (j = 0; j < lib_num_loc_cpus; j++) {
	secondcnt[j] = *(lib_idle_address[j]);
	if(debug) {
	/* I know that there are situations where compilers know about */
	/* long long, but the library fucntions do not... raj 4/95 */
	fprintf(where,
	"\tfirstcnt[%d] = 0x%8.8lx%8.8lx secondcnt[%d] = 0x%8.8lx%8.8lx\n",
	j,
	(uint32_t)(firstcnt[j]>>32),
	(uint32_t)(firstcnt[j]&0xffffffff),
	j,
	(uint32_t)(secondcnt[j]>>32),
	(uint32_t)(secondcnt[j]&0xffffffff));
	}
	/* we assume that it would wrap no more than once. we also */
	/* assume that the result of subtracting will "fit" raj 4/95 */
	temp_rate = (secondcnt[j] >= firstcnt[j]) ?
	(float)(secondcnt[j] - firstcnt[j])/elapsed :
	(float)(secondcnt[j]-firstcnt[j]+MAXLONG)/elapsed;
	if (temp_rate > rate[i]) rate[i] = temp_rate;
	if(debug) {
	fprintf(where,"\trate[%d] = %g\n",i,rate[i]);
	fflush(where);
	}
	if (local_maxrate < rate[i]) local_maxrate = rate[i];
	}
	}
	if(debug) {
	fprintf(where,"\tlocal maxrate = %g per sec. \n",local_maxrate);
	fflush(where);
	}
	return local_maxrate;
	}


	void
	get_cpu_idle (uint64_t *res)
	{
	int i;

	for (i = 0; i < lib_num_loc_cpus; i++){
	res[i] = *lib_idle_address[i];
	}

	}

	float
	calc_cpu_util_internal(float elapsed_time)
	{
	int i;
	float correction_factor;
	float actual_rate;

	lib_local_cpu_util = (float)0.0;
	/* It is possible that the library measured a time other than */
	/* the one that the user want for the cpu utilization */
	/* calculations - for example, tests that were ended by */
	/* watchdog timers such as the udp stream test. We let these */
	/* tests tell up what the elapsed time should be. */

	if (elapsed_time != 0.0) {
	correction_factor = (float) 1.0 +
	((lib_elapsed - elapsed_time) / elapsed_time);
	}
	else {
	correction_factor = (float) 1.0;
	}

	for (i = 0; i < lib_num_loc_cpus; i++) {

	/* it would appear that on some systems, in loopback, nice is
	very effective, causing the looper process to stop dead in its
	tracks. if this happens, we need to ensure that the calculation
	does not go south. raj 6/95 and if we run completely out of idle,
	the same thing could in theory happen to the USE_KSTAT path. raj
	8/2000 */

	if (lib_end_count[i] == lib_start_count[i]) {
	lib_end_count[i]++;
	}

	actual_rate = (lib_end_count[i] > lib_start_count[i]) ?
	(float)(lib_end_count[i] - lib_start_count[i])/lib_elapsed :
	(float)(lib_end_count[i] - lib_start_count[i] +
	MAXLONG)/ lib_elapsed;
	if (debug) {
	fprintf(where,
	"calc_cpu_util: actual_rate on processor %d is %f start 0x%8.8lx%8.8lx end 0x%8.8lx%8.8lx\n",
	i,
	actual_rate,
	(uint32_t)(lib_start_count[i]>>32),
	(uint32_t)(lib_start_count[i]&0xffffffff),
	(uint32_t)(lib_end_count[i]>>32),
	(uint32_t)(lib_end_count[i]&0xffffffff));
	}
	lib_local_per_cpu_util[i] = (lib_local_maxrate - actual_rate) /
	lib_local_maxrate * 100;
	lib_local_cpu_util += lib_local_per_cpu_util[i];
	}
	/* we want the average across all n processors */
	lib_local_cpu_util /= (float)lib_num_loc_cpus;

	lib_local_cpu_util *= correction_factor;
	return lib_local_cpu_util;


	}
	void
	cpu_start_internal(void)
	{
	get_cpu_idle(lib_start_count);
	return;
	}

	void
	cpu_stop_internal(void)
	{
	get_cpu_idle(lib_end_count);
	}