
/labs/04_MatrixMul/instructions.tex

https://bitbucket.org/aterrel/2012summerssc395
\documentclass{article}
\usepackage{fullpage}
\usepackage{url}
\usepackage{listings}
\begin{document}
\begin{center}
{\LARGE Lab 04: Matrix Matrix Multiply}\\
{\large SSC 375/395 Summer 2012}
\end{center}
\section{Setting up the environment}
\begin{description}
\item {\bf Download the class git repo}\\
Open a terminal and execute {\tt git clone
git@bitbucket.org:aterrel/2012summerssc395.git}.
\item {\bf Find the lab 4 files}\\
Go to the directory 2012summerssc395/labs/04\_MatrixMul/. Inside you will
find the following files:
\begin{itemize}
\item cuda\_helper.cu/.h -- a set of helper functions for interacting with
the device
\item matrixMul.h -- a set of program parameters
\item matrixMul\_gold.cpp -- a CPU version to check against
\item matrixMul\_kernel.cu -- the GPU matrix-matrix multiply kernel
\item matmult\_runner.cu -- an executable to test matrixMul\_kernel
\end{itemize}
\end{description}
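For reference, the CPU check computes the same product on the host with a
plain triple loop. A minimal sketch of what such a check might look like is
below; the actual function name and signature in {\tt matrixMul\_gold.cpp}
may differ.

```cpp
#include <cstddef>

// Hypothetical sketch of a CPU reference multiply: C = A * B, where A is
// hA x wA and B is wA x wB, all stored in row-major order. The real
// routine in matrixMul_gold.cpp may use a different name and signature.
void computeGold(float* C, const float* A, const float* B,
                 std::size_t hA, std::size_t wA, std::size_t wB)
{
    for (std::size_t i = 0; i < hA; ++i)
        for (std::size_t j = 0; j < wB; ++j) {
            double sum = 0;  // accumulate in double for a tighter check
            for (std::size_t k = 0; k < wA; ++k)
                sum += A[i * wA + k] * B[k * wB + j];
            C[i * wB + j] = static_cast<float>(sum);
        }
}
```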
\section{Writing non-shared memory matrix multiply}
The first task is to write the matrix-matrix multiply ($C = AB$) in a naive
manner. In this version each thread computes a single entry of matrix $C$.
See the stub below (also in {\tt matrixMul\_kernel.cu}):
\begin{lstlisting}
template <int BLOCK_SIZE> __global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
{
    // Calculate the row index of the C element
    int Row = XXX;
    // Calculate the column index of the C element
    int Col = XXX;
    float Cvalue = 0;
    // Each thread computes one element of C
    for (int k = 0; k < XXX; ++k)
        Cvalue += A[XXX] * B[XXX];
    C[XXX] = Cvalue;
}
\end{lstlisting}
Fill in {\tt matrixMul\_kernel.cu}; run make; run matmult\_runner. What is
the peak performance of the card, and what performance do you achieve?
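Recall that a matrix-matrix multiply of an $m \times k$ matrix by a
$k \times n$ matrix performs $2mnk$ floating-point operations (one multiply
and one add per inner-product term), so the achieved rate follows from the
measured kernel time. A sketch of that calculation, assuming the elapsed
time in seconds comes from the runner's timing code:

```cpp
#include <cstddef>

// Sketch of the usual performance calculation for C = A * B with A being
// m x k and B being k x n: 2*m*n*k floating-point operations divided by
// the measured time. "seconds" is a placeholder for whatever timing the
// runner reports (e.g. from CUDA events).
double gigaflops(std::size_t m, std::size_t n, std::size_t k, double seconds)
{
    double ops = 2.0 * m * n * k;
    return ops / seconds / 1e9;
}
```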
\section{Writing shared memory matrix multiply}
As discussed in class, one major bottleneck is global memory access, which
can be reduced by loading tiles into shared memory. In this version of the
code, each block loads a tile of $A$ and $B$ into shared memory, computes the
corresponding block of $C$, and then writes it back to the resulting matrix.
Move {\tt matrixMul\_kernel.cu} to {\tt matrixMul\_kernel\_simple.cu} and
write a tiled version in {\tt matrixMul\_kernel.cu}.
Below is a stub to get started:
\begin{lstlisting}
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]

template <int BLOCK_SIZE> __global__ void
matrixMul( float* C, float* A, float* B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Index of the first sub-matrix of A processed by the block
    int aBegin = XXX;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = XXX;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[XXX][XXX];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[XXX][XXX];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[XXX];
        BS(ty, tx) = B[XXX];
        // Synchronize to make sure the matrices are loaded
        __syncthreads();
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
            Csub += AS(XXX, XXX) * BS(XXX, XXX);
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = XXX;
    C[XXX] = Csub;
}
\end{lstlisting}
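One way to see why tiling helps: in the naive kernel each thread reads a full
row of $A$ and a full column of $B$ from global memory, roughly $2N^3$ loads
for $N \times N$ matrices, while the tiled kernel stages each element through
shared memory once per tile, cutting global traffic by a factor of the block
size. A small sketch of that back-of-the-envelope count (a simplified model
that ignores caching):

```cpp
#include <cstddef>

// Rough global-memory load counts for an N x N multiply: the naive kernel
// reads 2*N elements per output entry (2*N^3 total), while the tiled
// kernel reads 2*N/blockSize per entry, a factor-of-blockSize reduction.
std::size_t naiveLoads(std::size_t n) { return 2 * n * n * n; }

std::size_t tiledLoads(std::size_t n, std::size_t blockSize)
{
    return 2 * n * n * n / blockSize;
}
```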
Run make; run matmult\_runner. What is the peak performance of the card, and
what performance do you achieve?
\end{document}