#include #include "common.h" #define max(A,B) ( (A) > (B) ? (A) : (B) ) //actual maximum is 512, but that often won't get you maximum performance const int THREADS_PER_BLOCK = 128; //I've found that for numeric constants on the GPU, it's more reliable to define // them as const global variables static const float mass = 0.01; static const float cutoff = 0.01; static const float min_r = (cutoff/100); void checkCUDAError(const char * msg); //this is a device function, so it can only be called from within the kernel //caution: device functions are always inlined __device__ void apply_force(particle_t * particle, particle_t * neighbor) { float dx = neighbor->x - particle->x; float dy = neighbor->y - particle->y; float r2 = dx*dx + dy*dy; r2 = max(r2, min_r*min_r); float r = rsqrtf(r2); // // very simple short-range repulsive force // float coef = ( 1 - cutoff*r ) / r2; particle->ax += coef * dx; particle->ay += coef * dy; } // kernel that does the n-to-n force calculations in a naive way // remember, this same function is run across all of the gpu threads when the kernel is invoked __global__ void apply_forces_naive_kernel(particle_t * d_particles, int nbodies) { //calculate the index of this thread's body (each thread will handle just one body) int n = blockIdx.x * blockDim.x + threadIdx.x; if (n >= nbodies) return; d_particles[n].ax = 0; d_particles[n].ay = 0; int i; for (i = 0; i < nbodies; i++) { apply_force(&(d_particles[n]), &(d_particles[i])); //24 ms } d_particles[n].ax /= mass; d_particles[n].ay /= mass; } /* does calculation of forces between all the particles on the GPU (this function sets up and runs the kernel, as well as copying the data back and forth) particle_t* particles: array of particles int nbodies: number of particles */ extern "C" //makes this function visible from the other C code void apply_all_forces(particle_t * particles, int nbodies, char timing) { //pointer to particles array on the device (device == GPU) particle_t * d_particles; int num_threads, num_blocks; // kernel invocation parameters float time; cudaEvent_t cstart, cend; // for timing //allocate room for particles in global memory on the device cudaMalloc((void **)&d_particles, sizeof(particle_t)*nbodies); checkCUDAError("after cudaMalloc"); //copy particle data onto device cudaMemcpy(d_particles, particles, sizeof(particle_t)*nbodies, cudaMemcpyHostToDevice); checkCUDAError("after cudaMemcpyHostToDevice"); if (nbodies < THREADS_PER_BLOCK) { num_threads = nbodies; num_blocks = 1; } else { num_threads = THREADS_PER_BLOCK; num_blocks = nbodies / THREADS_PER_BLOCK + (nbodies % THREADS_PER_BLOCK == 0 ? 0 : 1); } // c++ style constructor, dim3 has x, y, and z components, // which default to 1 if not specified dim3 gridDim(num_blocks); dim3 blockDim(num_threads); //start timing if(timing == 'y'){ cudaEventCreate(&cstart); cudaEventCreate(&cend); cudaEventRecord(cstart,0); } //invoke kernel to do calculations apply_forces_naive_kernel <<< gridDim, blockDim >>> (d_particles, nbodies); cudaThreadSynchronize(); checkCUDAError("after kernel invocation"); //stop timing and print it out if(timing == 'y'){ cudaEventRecord(cend,0); cudaEventSynchronize(cend); cudaEventElapsedTime(&time, cstart, cend); cudaEventDestroy(cstart); cudaEventDestroy(cend); printf("naive kernel: %g ms\n", time); } //copy the data back from the device cudaMemcpy(particles, d_particles, sizeof(particle_t)*nbodies, cudaMemcpyDeviceToHost); checkCUDAError("after cudaMemcpyDeviceToHost"); //free the memory on the GPU cudaFree(d_particles); } void checkCUDAError(const char * msg) { cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } }