[CSC 435] And the pthreads...
Andrew J. Pounds
pounds_aj at mercer.edu
Sat Mar 26 13:35:07 EDT 2016
Attached is the pthreads code with a makefile I used for it. You will
note that I tried closely to follow the examples in the Pthreads text
(page 56 in the edition I have).
Again, my recommendation is that you create a branch on your babyblas
repository and stick this there for testing and reference.
--
Andrew J. Pounds, Ph.D. (pounds_aj at mercer.edu)
Professor of Chemistry and Computer Science
Mercer University, Macon, GA 31207 (478) 301-5627
http://faculty.mercer.edu/pounds_aj
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://theochem.mercer.edu/pipermail/csc435/attachments/20160326/58c245f2/attachment.html>
-------------- next part --------------
#ifdef __cplusplus
extern "C" {
#endif
void mmm_( int *threads, int *len, double *a, double *b, double*c );
#ifdef __cplusplus
}
#endif
#ifdef PTHREADS
/* P T H R E A D S S E C T I O N */
#include <stdlib.h>
#include <pthread.h>
// Define a "global" structure here
typedef struct {
int id;
int veclen;
int rowstart;
int stride;
double *A;
double *B;
double *C;
} matrix_worker;
// This is the actual thread worker
void mmmp(matrix_worker *work_orderp){
double *MA = work_orderp->A;
double *MB = work_orderp->B;
double *MC = work_orderp->C;
int veclen = work_orderp->veclen;
int rowstart = work_orderp->rowstart;
int rowstop = rowstart + work_orderp->stride;
int i, j, k;
int strip, mod;
#ifdef STRIP8
strip = 8;
mod = veclen % strip;
for (i=rowstart; i<rowstop; i++) {
for (j=0; j<veclen; j++) {
*(MC+(i*veclen+j)) = 0.0;
for (k=0;k<mod;k++) {
*(MC+(i*veclen+j)) += *(MA+(i*veclen+k)) * *(MB+(k*veclen+j));
}
for (k=mod;k<veclen;k+=strip) {
*(MC+(i*veclen+j)) += *(MA+(i*veclen+k )) * *(MB+( k *veclen+j))
+ *(MA+(i*veclen+k+1)) * *(MB+((k+1)*veclen+j))
+ *(MA+(i*veclen+k+2)) * *(MB+((k+2)*veclen+j))
+ *(MA+(i*veclen+k+3)) * *(MB+((k+3)*veclen+j));
+ *(MA+(i*veclen+k+4)) * *(MB+((k+4)*veclen+j));
+ *(MA+(i*veclen+k+5)) * *(MB+((k+5)*veclen+j));
+ *(MA+(i*veclen+k+6)) * *(MB+((k+6)*veclen+j));
+ *(MA+(i*veclen+k+7)) * *(MB+((k+7)*veclen+j));
}
}
}
#else
strip = 4;
mod = veclen % strip;
for (i=rowstart; i<rowstop; i++) {
for (j=0; j<veclen; j++) {
*(MC+(i*veclen+j)) = 0.0;
for (k=0;k<mod;k++) {
*(MC+(i*veclen+j)) += *(MA+(i*veclen+k)) * *(MB+(k*veclen+j));
}
for (k=mod;k<veclen;k+=strip) {
*(MC+(i*veclen+j)) += *(MA+(i*veclen+k )) * *(MB+( k *veclen+j))
+ *(MA+(i*veclen+k+1)) * *(MB+((k+1)*veclen+j))
+ *(MA+(i*veclen+k+2)) * *(MB+((k+2)*veclen+j))
+ *(MA+(i*veclen+k+3)) * *(MB+((k+3)*veclen+j));
}
}
}
#endif
}
void mmm_( int *threads, int *len, double *a, double *b, double *c ){
// This is the matrix multiply code that is called to orchestrate the thread workers
// void mmm_( int *len, double *a, double *b, double *c ){
int i;
int start = 0;
pthread_t task_id[*threads];
int *steps = malloc(*threads*sizeof(int));
matrix_worker *work_orderp;
// Divide the code up equally amongst all of the threads
if ( *len % *threads == 0 ) {
for (i=0;i<*threads;i++) *(steps+i) = *len / *threads;
}
else {
for (i=0;i<*threads;i++) *(steps+i) = *len / *threads;
for (i=0;i<*len % *threads;i++) *(steps+i) = *(steps+i) + 1;
}
start = 0;
for (i=0;i<*threads;i++) {
work_orderp = (matrix_worker *) malloc(sizeof(matrix_worker));
work_orderp->id = i;
work_orderp->veclen = *len;
work_orderp->rowstart = start;
work_orderp->stride = *(steps+i);
work_orderp->A = a;
work_orderp->B = b;
work_orderp->C = c;
pthread_create(&(task_id[i]), NULL, (void *) mmmp, (void *) work_orderp);
start += work_orderp->stride;
}
for (i=0; i<*threads; i++) pthread_join(task_id[i], NULL);
}
#else
/* S E R I A L C O D E */
void mmm_( int *threads, int *len, double *a, double *b, double *c ){
int i, j, k;
int veclen = *len;
int mod;
#ifdef STRIP8
const int stride = 8;
mod = veclen % stride;
for (i=0; i<veclen; i++) {
for (j=0; j<veclen; j++) {
*(c+(i*veclen+j)) = 0.0;
for (k=0;k<mod;k++){
*(c+(i*veclen+j)) += *(a+(i*veclen+k)) * *(b+(k*veclen+j));
}
for (k=mod;k<veclen;k+=stride) {
*(c+(i*veclen+j)) += *(a+(i*veclen+k )) * *(b+( k *veclen+j))
+ *(a+(i*veclen+k+1)) * *(b+((k+1)*veclen+j))
+ *(a+(i*veclen+k+2)) * *(b+((k+2)*veclen+j))
+ *(a+(i*veclen+k+3)) * *(b+((k+3)*veclen+j))
+ *(a+(i*veclen+k+4)) * *(b+((k+4)*veclen+j))
+ *(a+(i*veclen+k+5)) * *(b+((k+5)*veclen+j))
+ *(a+(i*veclen+k+6)) * *(b+((k+6)*veclen+j))
+ *(a+(i*veclen+k+7)) * *(b+((k+7)*veclen+j));
}
}
}
#elif STRIP4
const int stride = 4;
mod = veclen % stride;
for (i=0; i<veclen; i++) {
for (j=0; j<veclen; j++) {
*(c+(i*veclen+j)) = 0.0;
for (k=0;k<mod;k++){
*(c+(i*veclen+j)) += *(a+(i*veclen+k)) * *(b+(k*veclen+j));
}
for (k=mod;k<veclen;k+=stride) {
*(c+(i*veclen+j)) += *(a+(i*veclen+k )) * *(b+( k *veclen+j))
+ *(a+(i*veclen+k+1)) * *(b+((k+1)*veclen+j))
+ *(a+(i*veclen+k+2)) * *(b+((k+2)*veclen+j))
+ *(a+(i*veclen+k+3)) * *(b+((k+3)*veclen+j));
}
}
}
#else
// Normal Matrix Multiplication
for (i=0; i<veclen; i++) {
for (j=0; j<veclen; j++) {
*(c+(i*veclen+j)) = 0.0;
for (k=0;k<veclen;k++){
*(c+(i*veclen+j)) += *(a+(i*veclen+k)) * *(b+(k*veclen+j));
}
}
}
#endif
}
#endif
-------------- next part --------------
# Makefile to build boxcar diffusion Program
#
# Andrew J. Pounds, Ph.D.
# Departments of Chemistry and Computer Science
# Mercer University
# Fall 2011
#
F95 = gfortran
CC = gcc
debug ?= n
ifeq ($(debug), y)
CFLAGS += -g -DDEBUG
else
CFLAGS += -O3
endif
ATLASLIBS = -L/usr/lib64/atlas -lblas -llapack -lf77blas -lcblas -latlas
OBJS = array.o zeromat.o walltime.o cputime.o mmm.o \
vvm.o
PTHREADS = -DPTHREADS -pthread
all: driver atlasdriver
atlasdriver : atlasdriver.o $(OBJS)
$(F95) -o atlasdriver atlasdriver.o $(OBJS) $(ATLASLIBS)
atlasdriver.o : atlasdriver.f90 array.o
$(F95) $(FFLAGS) -c atlasdriver.f90
driver : driver.o $(OBJS)
$(F95) -o driver driver.o $(OBJS) $(PTHREADS)
driver.o : driver.f90 array.o
$(F95) $(FFLAGS) -c driver.f90
zeromat.o : zeromat.f90
$(F95) $(FFLAGS) -c zeromat.f90
array.o : array.f90
$(F95) -c array.f90
mmm.o : mmm.c
$(CC) $(CFLAGS) $(COPTFLAGS) $(PTHREADS) -c mmm.c
vvm.o : vvm.c
$(CC) $(CFLAGS) -c vvm.c
# Timing Library targets
walltime.o : walltime.c
$(CC) -c walltime.c
cputime.o : cputime.c
$(CC) -c cputime.c
lib: cputime.o walltime.o
ar -rc liblbstime.a cputime.o walltime.o
ranlib liblbstime.a
# Default Targets for Cleaning up the Environment
clean :
rm *.o
pristine :
rm *.o
touch *.c *.f90
rm *.mod
rm driver atlasdriver
ctags :
ctags *.f90 *.c
More information about the csc435
mailing list