Example MPI C / CUDA device query program

Example MPI C / CUDA. This includes instructions to compile and run the application.

How to Compile

1. Load the CUDA toolkit and OpenMPI.

module add cuda32/toolkit openmpi/gcc

2. Call nvcc to compile:

nvcc cuda-mpi4.cu -I$CUDA_SDK/shared/inc -I$MPI_HOME/include -L$MPI_HOME/lib64/ -lmpi -arch sm_13  -o cuda-mpi.exe

Example run

To run, simply launch it with mpirun. Please note that rank 0 does not perform any CUDA detection in this simple example: it only collects and prints the data, while ranks 1 and up do the actual CUDA detection.

mpirun -np 5 -machinefile nodes ./cuda-mpi.exe

nodes file:
master
node001
node002
node003
node004

Sample output

  We have 4 processors
  Spawning from gpu01 
  CUDA MPI
 
 
 
 
  Probing nodes...
     Node        Psid  CUDA Cards (devID)
     ----------- ----- ---- ----------
+ gpu01           1    1 GeForce GT 430 (0) 
+ gpu02           2    1 GeForce GT 430 (0) 
- node001         3    0 NONE

As the output shows, gpu01 and gpu02 each have 1 CUDA card, while node001 has none. Nodes marked with a + contain at least 1 CUDA-capable card; nodes marked with a - contain none.

Please note that you will need to spawn 1 process per node, as each process will assume it is the only one on the node. Each process simply iterates over the CUDA cards visible locally, so spawning 4 processes on a single node, for instance, will return the same result 4 times.

Example jobscript

#!/bin/sh
#$ -N CUDA-MPI
#$ -S /bin/bash
#$ -pe openmpi 8
#$ -l h_rt=08:00:00
. /etc/profile.d/modules.sh
 
module add openmpi/gcc cuda32/toolkit
 
mpirun -np $NSLOTS -machinefile $TMPDIR/machines ./cuda-mpi.exe

MPI CUDA Code

#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "shrUtils.h"
// utilities and system includes
// CUDA-C includes
#include "cuda.h"
#define BUFSIZE 256
#define TAG 0
 
 
/* Number of CUDA devices detected on the local node (filled in by worker ranks). */
int devCount;
/* This process's MPI rank (set in main via MPI_Comm_rank). */
int myid;
/* Set to 1 by a worker rank when at least one CUDA device is found, 0 otherwise. */
int ihavecuda;
/* NOTE(review): nodes[], nocuda[] and deviceselector are never referenced in this
   file — presumably leftovers from an earlier version; confirm before removing. */
int nodes[256];
int nocuda[256];
int deviceselector=0;
 
 
 
/*
 * MPI/CUDA device probe.
 *
 * Rank 0: sends an (empty) go-ahead message to every other rank, then
 * receives one formatted report line per rank and prints a table.
 * Ranks 1..N-1: wait for the go-ahead, count the local CUDA devices,
 * format a one-line report ("+ host rank count names..." or
 * "- host rank count NONE") and send it back to rank 0.
 *
 * Spawn exactly one process per node; each worker reports only the
 * devices visible on its own host.
 */
int main(int argc, char *argv[])
{
        char idstr[256];
        char idstr2[256];
        char buff[BUFSIZE];
        int i;
        int numprocs, rank, namelen;
        char processor_name[MPI_MAX_PROCESSOR_NAME];
        /* Hide CUDA runtime error spew from nodes that have no CUDA cards. */
        freopen("/dev/null", "w", stderr);
        MPI_Status stat;
        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Get_processor_name(processor_name, &namelen);
        /* The original called MPI_Comm_rank a second time; myid is simply rank. */
        myid = rank;
        if (myid == 0)
        {
                printf("  We have %d processors\n", numprocs);
                printf("  Spawning from %s \n", processor_name);
                printf("  CUDA MPI\n");
                printf("\n");
                /* Tell every worker rank to start probing. */
                for (i = 1; i < numprocs; i++)
                {
                        buff[0] = '\0';
                        MPI_Send(buff, BUFSIZE, MPI_CHAR, i, TAG, MPI_COMM_WORLD);
                }
                printf("\n\n\n");
                printf("  Probing nodes...\n");
                printf("     Node        Psid  CUDA Cards (devID)\n");
                printf("     ----------- ----- ---- ----------\n");
                /* Collect one report line per worker, in rank order. */
                for (i = 1; i < numprocs; i++)
                {
                        MPI_Recv(buff, BUFSIZE, MPI_CHAR, i, TAG, MPI_COMM_WORLD, &stat);
                        printf("%s\n", buff);
                }
                printf("\n");
                /* BUG FIX: the original called MPI_Finalize() here AND again
                 * after the if/else, finalizing MPI twice on rank 0 — which is
                 * erroneous per the MPI standard. Finalize once, below. */
        }
        else
        {
                /* Wait for rank 0's go-ahead before touching the GPU. */
                MPI_Recv(buff, BUFSIZE, MPI_CHAR, 0, TAG, MPI_COMM_WORLD, &stat);
                MPI_Get_processor_name(processor_name, &namelen);
                /* BUG FIX: check the CUDA runtime status — on a node without a
                 * usable CUDA installation, treat any error as "zero devices"
                 * rather than trusting devCount. */
                if (cudaGetDeviceCount(&devCount) != cudaSuccess)
                        devCount = 0;
                buff[0] = '\0';
                idstr[0] = '\0';
                if (devCount == 0)
                {
                        /* snprintf instead of sprintf: never overrun idstr. */
                        snprintf(idstr, sizeof(idstr), "- %-11s %5d %4d NONE",
                                 processor_name, rank, devCount);
                        ihavecuda = 0;
                }
                else
                {
                        /* NOTE: the original had an unreachable inner branch for
                         * "devCount != 0 && devCount < 1" that also indexed
                         * cudaGetDeviceProperties with a stale loop variable;
                         * it has been removed. */
                        ihavecuda = 1;
                        snprintf(idstr, sizeof(idstr), "+ %-11s %5d %4d",
                                 processor_name, rank, devCount);
                        /* Append "name (devID)" for every local device. */
                        for (int dev = 0; dev < devCount; ++dev)
                        {
                                cudaDeviceProp devProp;
                                cudaGetDeviceProperties(&devProp, dev);
                                snprintf(idstr2, sizeof(idstr2), " %s (%d) ",
                                         devProp.name, dev);
                                /* BUG FIX: strncat's bound is the REMAINING
                                 * space, not the total buffer size — the
                                 * original passed BUFSIZE and could overflow. */
                                strncat(idstr, idstr2,
                                        sizeof(idstr) - strlen(idstr) - 1);
                        }
                }
                strncat(buff, idstr, BUFSIZE - strlen(buff) - 1);
                MPI_Send(buff, BUFSIZE, MPI_CHAR, 0, TAG, MPI_COMM_WORLD);
        }
        MPI_Finalize();
        return 0;
}

Copyright 2002-2014 ClusterVision BV