My first CUDA program

CUDA is an exciting tool for accelerating existing models. Here I am applying it to a simple water balance model.

What a water balance model does can be described simply as pouring water into, and extracting water out of, a bucket. Here "pouring water into the bucket" can take the form of precipitation, irrigation, etc., and "extracting water out of the bucket" can happen through evaporation, transpiration, runoff, etc. Because each grid cell is updated independently, the calculation maps nicely onto individual CUDA threads, as long as I am not involving overly complicated representations of these processes. Modern hydrological models do contain sophisticated mathematical descriptions of these processes, but for now I am not bothering my CUDA threads with them.
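In serial form the bookkeeping for each cell is a single line: new storage = old storage + gain - loss. Here is a minimal CPU sketch of that update (the function name and array layout are my own choice, not part of the GPU program below):

/* Serial reference: add the gain (p) and subtract the loss (t) for every cell */
void water_balance_cpu(float *water, const float *p, const float *t, int n)
{
  for (int i = 0; i < n; i++) {
    water[i] = water[i] + p[i] - t[i];
  }
}

The GPU version below does exactly this, except that the loop body becomes a kernel and each cell is handled by its own block.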

My platform is VS2013 + CUDA 6.5; some of the configuration might differ on a Linux platform. Here is the program:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "stdlib.h"
#include "math.h"
#include "ctime.h"

cudaError_t wtbWithCuda(float *water, float *p, float *t, int size);

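// Kernel: launched with one thread per block, so blockIdx.x is the cell index
// and each block updates one bucket.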
__global__ void wtbKernel(float *water, float *p, float *t){
  int i = blockIdx.x;
  water[i] = water[i] + p[i] - t[i];
}

int main(int argc, char *argv[]){
  FILE *fistate, *fforcing;
  float *water_data, *p_data, *t_data;
  int i, cellN;
  errno_t errorCode;
  clock_t begin_gpu, end_gpu;
  double gputime;

  /* Usage */
  if (argc != 3) {
    fprintf(stdout, "Usage: %s \n", argv[0]);
    fprintf(stdout, " List of initial water condition, 1st row is total cell number\n");
    fprintf(stdout, " Forcing file, each row: p t\n");
    exit(0);
  }

  errorCode = fopen_s(&fistate, argv[1], "r");
  if (errorCode != 0) {
    fprintf(stderr, "%s: ERROR: Unable to open %s\n", argv[0], argv[1]);
    exit(1);
  }

  fscanf_s(fistate, "%d", &cellN);
  if ((water_data = (float*)calloc(cellN, sizeof(float))) == NULL) {
    fprintf(stderr, "argv[0]: ERROR: cannot allocate sufficient memory for water_data array\n", argv[0]);
    exit(1);
  }

  for (i = 0; i < cellN; i++) {
    fscanf_s(fistate, "%f", &(water_data[i]));
  }

  fclose(fistate);

  errorCode = fopen_s(&fforcing, argv[2], "r");
  if (errorCode != 0) {
    fprintf(stderr, "%s: ERROR: Unable to open %s\n", argv[0], argv[2]);
    exit(1);
  }

  if ((p_data = (float*)calloc(cellN, sizeof(float))) == NULL) {
    fprintf(stderr, "argv[0]: ERROR: cannot allocate sufficient memory for p_data array\n", argv[0]);
    exit(1);
  }
  if ((t_data = (float*)calloc(cellN, sizeof(float))) == NULL) {
    fprintf(stderr, "argv[0]: ERROR: cannot allocate sufficient memory for t_data array\n", argv[0]);
    exit(1);
  }

  for (i = 0; i < cellN; i++) {
    fscanf_s(fforcing, "%f %f", &(p_data[i]), &(t_data[i]));
  }

  fclose(fforcing);

  begin_gpu = clock();
  cudaError_t cudaStatus = wtbWithCuda(water_data, p_data, t_data, cellN);
  end_gpu = clock();
  gputime = (double)(end_gpu - begin_gpu) / CLOCKS_PER_SEC;

  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addWithCuda failed!");
    return 1;
  }

  // cudaDeviceReset must be called before exiting in order for profiling and
  // tracing tools such as Nsight and Visual Profiler to show complete traces.
  cudaStatus = cudaDeviceReset();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceReset failed!");
    return 1;
  }

  /* Output results */
  for (i = 0; i < cellN; i++) {
    fprintf(stdout, "%.4f\n", water_data[i]);
  }
  fprintf(stderr, "GPU calculation done\n");
  return 0;

}

cudaError_t wtbWithCuda(float *water, float *p, float *t, int size){
  float *dev_a = 0;
  float *dev_p = 0;
  float *dev_t = 0;
  cudaError_t cudaStatus;

  // Choose which GPU to run on, change this on a multi-GPU system.
  cudaStatus = cudaSetDevice(0);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
    goto Error;
  }

  // Allocate GPU buffers for three vectors.
  cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(float));
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
  }

  cudaStatus = cudaMalloc((void**)&dev_p, size * sizeof(float));
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
  }

  cudaStatus = cudaMalloc((void**)&dev_t, size * sizeof(float));
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
  }

  // Copy input vectors from host memory to GPU buffers.
  cudaStatus = cudaMemcpy(dev_a, water, size * sizeof(float), cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
  }

  cudaStatus = cudaMemcpy(dev_p, p, size * sizeof(float), cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
  }

  cudaStatus = cudaMemcpy(dev_t, t, size * sizeof(float), cudaMemcpyHostToDevice);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
  }

  // Launch the kernel with one block (of one thread) per cell.
  wtbKernel<<<size, 1>>>(dev_a, dev_p, dev_t);

  // Check for any errors launching the kernel
  cudaStatus = cudaGetLastError();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
  }

  // cudaDeviceSynchronize waits for the kernel to finish, and returns
  // any errors encountered during the launch.
  cudaStatus = cudaDeviceSynchronize();
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
  }

  // Copy output vector from GPU buffer to host memory.
  cudaStatus = cudaMemcpy(water, dev_a, size * sizeof(float), cudaMemcpyDeviceToHost);
  if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
  }

  Error:
  cudaFree(dev_a);
  cudaFree(dev_p);
  cudaFree(dev_t);

  return cudaStatus;
}
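To try this on Linux, the Microsoft-specific fopen_s/fscanf_s/errno_t calls would need to be swapped for plain fopen/fscanf, after which the source should build with something like nvcc -o wtb wtb.cu (the file name wtb.cu is just my choice here). As a made-up example, if the initial-state file contains

3
10.0 20.0 30.0

and the forcing file contains

1.5 0.5
2.0 1.0
0.0 3.0

then running the program on those two files prints the updated storages 11.0000, 21.0000 and 27.0000 to stdout, one per line.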
