#include <assert.h>
#include "common.h"

#define max(A,B)	( (A) > (B) ? (A) : (B) )

//actual maximum is 512, but that often won't get you maximum performance
const int THREADS_PER_BLOCK = 128;

//I've found that for numeric constants on the GPU, it's more reliable to define
// them as const global variables
static const float mass = 0.01;
static const float cutoff = 0.01;
static const float min_r = (cutoff/100);

void checkCUDAError(const char * msg);

//this is a device function, so it can only be called from within the kernel
//caution: device functions are always inlined

__device__ void apply_force(particle_t * particle, particle_t * neighbor) {
	float dx = neighbor->x - particle->x;
	float dy = neighbor->y - particle->y;
	float r2 = dx*dx + dy*dy;
	r2 = max(r2, min_r*min_r);
	float r = rsqrtf(r2);
	
    //
    //  very simple short-range repulsive force
    //
    float coef = ( 1 - cutoff*r ) / r2;
    particle->ax += coef * dx;
    particle->ay += coef * dy;
}


// kernel that does the n-to-n force calculations in a naive way
// remember, this same function is run across all of the gpu threads when the kernel is invoked

__global__ void apply_forces_naive_kernel(particle_t * d_particles, int nbodies) {
	//calculate the index of this thread's body (each thread will handle just one body)
	int n = blockIdx.x * blockDim.x + threadIdx.x;
	
	if (n >= nbodies)
		return;
	
	d_particles[n].ax = 0;
	d_particles[n].ay = 0;
	
	int i;
	for (i = 0; i < nbodies; i++) {
		apply_force(&(d_particles[n]), &(d_particles[i])); //24 ms
	}
	d_particles[n].ax /= mass;
	d_particles[n].ay /= mass;
}


/*
  does calculation of forces between all the particles on the GPU
  (this function sets up and runs the kernel, as well as copying the data back and forth)
	particle_t* particles: array of particles
	int nbodies: number of particles
*/
extern "C" //makes this function visible from the other C code
void apply_all_forces(particle_t * particles, int nbodies, char timing) {
	//pointer to particles array on the device (device == GPU)
	particle_t * d_particles;
	int num_threads, num_blocks;   // kernel invocation parameters
	float time;
	cudaEvent_t cstart, cend;      // for timing

	//allocate room for particles in global memory on the device	
	cudaMalloc((void **)&d_particles, sizeof(particle_t)*nbodies);
	checkCUDAError("after cudaMalloc");
	
	//copy particle data onto device
	cudaMemcpy(d_particles, particles, sizeof(particle_t)*nbodies, cudaMemcpyHostToDevice);
	checkCUDAError("after cudaMemcpyHostToDevice");
	
	if (nbodies < THREADS_PER_BLOCK) {
		num_threads = nbodies;
		num_blocks = 1;
	} else {
		num_threads = THREADS_PER_BLOCK;
		num_blocks = nbodies / THREADS_PER_BLOCK + (nbodies % THREADS_PER_BLOCK == 0 ? 0 : 1);
	}
	// c++ style constructor, dim3 has x, y, and z components,
	// which default to 1 if not specified

	dim3 gridDim(num_blocks);
	dim3 blockDim(num_threads);
	
	//start timing
	if(timing == 'y'){
	  cudaEventCreate(&cstart); cudaEventCreate(&cend);
	  cudaEventRecord(cstart,0);
	}
	
	//invoke kernel to do calculations
	apply_forces_naive_kernel <<< gridDim, blockDim >>> (d_particles, nbodies);
	cudaThreadSynchronize();
	checkCUDAError("after kernel invocation");
	
	//stop timing and print it out
	if(timing == 'y'){
	  cudaEventRecord(cend,0);
	  cudaEventSynchronize(cend);
	  cudaEventElapsedTime(&time, cstart, cend);
	  cudaEventDestroy(cstart); cudaEventDestroy(cend);
	  printf("naive kernel: %g ms\n", time);
	}
	
	//copy the data back from the device
	cudaMemcpy(particles, d_particles, sizeof(particle_t)*nbodies, cudaMemcpyDeviceToHost);
	checkCUDAError("after cudaMemcpyDeviceToHost");
	
	//free the memory on the GPU
	cudaFree(d_particles);
}

void checkCUDAError(const char * msg) {
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
		exit(EXIT_FAILURE);
	}
}