[CSC 435] And the pthreads...

Sat Mar 26 13:35:07 EDT 2016

Attached is the pthreads code with a makefile I used for it.  You will 
note that I tried closely to follow the examples in the Pthreads text 
(page 56 in the edition I have).

Again, my recommendation is that you create a branch on your babyblas 
repository and stick this there for testing and reference.

-- 
Andrew J. Pounds, Ph.D.  (pounds_aj at mercer.edu)
Professor of Chemistry and Computer Science
Mercer University,  Macon, GA 31207   (478) 301-5627
http://faculty.mercer.edu/pounds_aj

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://theochem.mercer.edu/pipermail/csc435/attachments/20160326/58c245f2/attachment.html>
-------------- next part --------------
#ifdef __cplusplus
extern "C" {
#endif
    void mmm_( int *threads, int *len,  double *a, double *b, double*c );
#ifdef __cplusplus
}
#endif

#ifdef PTHREADS

/*   P T H R E A D S    S E C T I O N  */

#include <stdlib.h>
#include <pthread.h>

// Define a "global" structure here 
typedef struct {
    int id;
    int veclen;
    int rowstart;
    int stride;
    double *A;
    double *B;
    double *C;
} matrix_worker;

// This is the actual thread worker 
void mmmp(matrix_worker *work_orderp){

    double *MA = work_orderp->A;
    double *MB = work_orderp->B;
    double *MC = work_orderp->C;

    int veclen = work_orderp->veclen;
    int rowstart = work_orderp->rowstart;
    int rowstop = rowstart + work_orderp->stride;

    int i, j, k;
    int strip, mod;
#ifdef STRIP8
    strip = 8;
    mod = veclen % strip;

    for (i=rowstart; i<rowstop; i++) {
        for (j=0; j<veclen; j++) {
            *(MC+(i*veclen+j)) = 0.0;
            for (k=0;k<mod;k++) {
                *(MC+(i*veclen+j)) += *(MA+(i*veclen+k)) * *(MB+(k*veclen+j));
            }
            for (k=mod;k<veclen;k+=strip) {
                *(MC+(i*veclen+j)) += *(MA+(i*veclen+k  )) * *(MB+( k   *veclen+j))
                    + *(MA+(i*veclen+k+1)) * *(MB+((k+1)*veclen+j))
                    + *(MA+(i*veclen+k+2)) * *(MB+((k+2)*veclen+j))
                    + *(MA+(i*veclen+k+3)) * *(MB+((k+3)*veclen+j));
                + *(MA+(i*veclen+k+4)) * *(MB+((k+4)*veclen+j));
                + *(MA+(i*veclen+k+5)) * *(MB+((k+5)*veclen+j));
                + *(MA+(i*veclen+k+6)) * *(MB+((k+6)*veclen+j));
                + *(MA+(i*veclen+k+7)) * *(MB+((k+7)*veclen+j));
            }
        }
    }

#else

    strip = 4;
    mod = veclen % strip;

    for (i=rowstart; i<rowstop; i++) {
        for (j=0; j<veclen; j++) {
            *(MC+(i*veclen+j)) = 0.0;
            for (k=0;k<mod;k++) {
                *(MC+(i*veclen+j)) += *(MA+(i*veclen+k)) * *(MB+(k*veclen+j));
            }
            for (k=mod;k<veclen;k+=strip) {
                *(MC+(i*veclen+j)) += *(MA+(i*veclen+k  )) * *(MB+( k   *veclen+j))
                    + *(MA+(i*veclen+k+1)) * *(MB+((k+1)*veclen+j))
                    + *(MA+(i*veclen+k+2)) * *(MB+((k+2)*veclen+j))
                    + *(MA+(i*veclen+k+3)) * *(MB+((k+3)*veclen+j));
            }
        }
    }
#endif
}

void mmm_( int *threads, int *len,  double *a, double *b, double *c ){

// This is the matrix multiply code that is called to orchestrate the thread workers
// void mmm_( int *len,  double *a, double *b, double *c ){

int i;
int start = 0;
pthread_t task_id[*threads];
int *steps = malloc(*threads*sizeof(int));
matrix_worker *work_orderp;

// Divide the code up equally amongst all of the threads
if (  *len % *threads == 0 ) {
    for (i=0;i<*threads;i++) *(steps+i) = *len / *threads;
}
else {
    for (i=0;i<*threads;i++) *(steps+i) = *len / *threads;
    for (i=0;i<*len % *threads;i++) *(steps+i) = *(steps+i) + 1;
}

start = 0;
for (i=0;i<*threads;i++) {

    work_orderp = (matrix_worker *) malloc(sizeof(matrix_worker));
    work_orderp->id = i;
    work_orderp->veclen = *len;
    work_orderp->rowstart = start;
    work_orderp->stride = *(steps+i);
    work_orderp->A = a;
    work_orderp->B = b;
    work_orderp->C = c;

    pthread_create(&(task_id[i]), NULL, (void *) mmmp, (void *) work_orderp);

    start += work_orderp->stride;
}

for (i=0; i<*threads; i++) pthread_join(task_id[i], NULL);

}

#else

/*  S E R I A L   C O D E  */

void mmm_( int *threads, int *len,  double *a, double *b, double *c ){

    int i, j, k;
    int veclen = *len;
    int mod;

#ifdef STRIP8
    const int stride = 8;

    mod = veclen % stride;

    for (i=0; i<veclen; i++) {
        for (j=0; j<veclen; j++) {
            *(c+(i*veclen+j)) = 0.0;
            for (k=0;k<mod;k++){
                *(c+(i*veclen+j)) += *(a+(i*veclen+k)) * *(b+(k*veclen+j)); 
            }
            for (k=mod;k<veclen;k+=stride) {
                *(c+(i*veclen+j)) += *(a+(i*veclen+k  )) * *(b+( k   *veclen+j)) 
                    + *(a+(i*veclen+k+1)) * *(b+((k+1)*veclen+j)) 
                    + *(a+(i*veclen+k+2)) * *(b+((k+2)*veclen+j)) 
                    + *(a+(i*veclen+k+3)) * *(b+((k+3)*veclen+j)) 
                    + *(a+(i*veclen+k+4)) * *(b+((k+4)*veclen+j)) 
                    + *(a+(i*veclen+k+5)) * *(b+((k+5)*veclen+j)) 
                    + *(a+(i*veclen+k+6)) * *(b+((k+6)*veclen+j)) 
                    + *(a+(i*veclen+k+7)) * *(b+((k+7)*veclen+j)); 
            }
        }
    }
#elif STRIP4
    const int stride = 4;

    mod = veclen % stride;

    for (i=0; i<veclen; i++) {
        for (j=0; j<veclen; j++) {
            *(c+(i*veclen+j)) = 0.0;
            for (k=0;k<mod;k++){
                *(c+(i*veclen+j)) += *(a+(i*veclen+k)) * *(b+(k*veclen+j)); 
            }
            for (k=mod;k<veclen;k+=stride) {
                *(c+(i*veclen+j)) += *(a+(i*veclen+k  )) * *(b+( k   *veclen+j)) 
                    + *(a+(i*veclen+k+1)) * *(b+((k+1)*veclen+j)) 
                    + *(a+(i*veclen+k+2)) * *(b+((k+2)*veclen+j)) 
                    + *(a+(i*veclen+k+3)) * *(b+((k+3)*veclen+j)); 
            }
        }
    }

#else

    // Normal Matrix Multiplication

    for (i=0; i<veclen; i++) {
        for (j=0; j<veclen; j++) {
            *(c+(i*veclen+j)) = 0.0;
            for (k=0;k<veclen;k++){
                *(c+(i*veclen+j)) += *(a+(i*veclen+k)) * *(b+(k*veclen+j)); 
            }
        }
    }
#endif
}
#endif

-------------- next part --------------
# Makefile to build boxcar diffusion Program 
#
# Andrew J. Pounds, Ph.D.
# Departments of Chemistry and Computer Science
# Mercer University
# Fall 2011 
#

F95 = gfortran   
CC = gcc 

debug ?= n
ifeq ($(debug), y)
    CFLAGS += -g -DDEBUG
else
    CFLAGS += -O3 
endif

ATLASLIBS = -L/usr/lib64/atlas -lblas -llapack -lf77blas -lcblas -latlas

OBJS = array.o zeromat.o walltime.o cputime.o mmm.o  \
       vvm.o 
PTHREADS = -DPTHREADS -pthread 

all: driver atlasdriver 

atlasdriver : atlasdriver.o $(OBJS)    
	$(F95) -o atlasdriver atlasdriver.o $(OBJS) $(ATLASLIBS) 

atlasdriver.o : atlasdriver.f90 array.o   
	$(F95) $(FFLAGS) -c atlasdriver.f90  

driver : driver.o $(OBJS)    
	$(F95) -o driver driver.o $(OBJS) $(PTHREADS)  

driver.o : driver.f90 array.o   
	$(F95) $(FFLAGS) -c driver.f90  

zeromat.o : zeromat.f90    
	$(F95) $(FFLAGS)  -c zeromat.f90  

array.o : array.f90
	$(F95) -c array.f90

mmm.o : mmm.c
	$(CC) $(CFLAGS) $(COPTFLAGS) $(PTHREADS) -c mmm.c

vvm.o : vvm.c
	$(CC) $(CFLAGS) -c vvm.c

# Timing Library targets 

walltime.o : walltime.c
	$(CC)  -c walltime.c

cputime.o : cputime.c
	$(CC)  -c cputime.c

lib: cputime.o walltime.o
	ar -rc liblbstime.a cputime.o walltime.o
	ranlib liblbstime.a

# Default Targets for Cleaning up the Environment
clean :
	rm *.o

pristine :
	rm *.o
	touch *.c *.f90 
	rm *.mod
	rm driver atlasdriver

ctags :
	ctags *.f90 *.c