<html>

  <head>


    <meta http-equiv="content-type" content="text/html; charset=UTF-8">

  </head>

  <body>

    <p><font face="serif">Thanks to Will for asking this question that

        required me to do a "deep dive" into some of the updates and changes

        to OpenMP.  In some of the old code I gave you for matrix

        multiplication I used a very simplistic parallelization scheme

        that OpenMP now has problems with parallelizing.  If you take

        the time to look into reductions and "omp for" pragmas, you

        can get some really nice speedups.  For example...</font></p>

    <p><font face="serif"><br>

      </font></p>

    <p><font face="Source Code Pro for Powerline">void mmm_( int

        *threads, int *len,  double *a, double *b, double *c ){<br>

        <br>

            int i, j, k;<br>

            int veclen = *len;<br>

            double s;<br>

        <br>

        // Set the number of threads to use here<br>

        <br>

            omp_set_num_threads(*threads);<br>

        <br>

        #pragma omp parallel shared(a,b,c,veclen) private(i,j,k)

        reduction(+:s)<br>

        {<br>

        #pragma omp for schedule(static)<br>

            for (i=0; i&lt;veclen; i++) {<br>

                for (j=0; j&lt;veclen; j++) {<br>

                    *(c+(i*veclen+j)) = 0.0;<br>

                    s = 0.0;<br>

                    for (k=0;k&lt;veclen;k++){<br>

                          s += *(a+(i*veclen+k)) * *(b+(k*veclen+j));<br>

                    }<br>

                    *(c+(i*veclen+j)) = s;<br>

                }<br>

            }<br>

        }<br>

        }<br>

      </font><br>

    </p>

    <pre class="moz-signature" cols="72">-- 

Andrew J. Pounds, Ph.D.  (<a class="moz-txt-link-abbreviated" href="mailto:pounds_aj@mercer.edu">pounds_aj@mercer.edu</a>)

Professor of Chemistry and Computer Science

Director of the Computational Science Program

Mercer University,  Macon, GA 31207   (478) 301-5627

</pre>

  </body>

</html>