diff --git a/template.cu b/template.cu index febd9aa..1d55b04 100644 --- a/template.cu +++ b/template.cu @@ -40,8 +40,7 @@ float omega[numBins] = {0}; // but still need cudaMalloc on device memory: unsigned int *histogramDR_gm, *histogramDD_gm, *histogramRR_gm; -float calculateAngularDistance(float g1_ra, float g1_dec, float g2_ra, float g2_dec) { - +__device__ float calculateAngularDistance(float g1_ra, float g1_dec, float g2_ra, float g2_dec) { // turning arc minutes to degree float ra1 = g1_ra/ 60 * M_PI / 180.0; float dec1 = g1_dec/ 60 * M_PI / 180.0; @@ -50,33 +49,40 @@ float calculateAngularDistance(float g1_ra, float g1_dec, float g2_ra, float g2_ // calculate angular distance float delta_ra = ra2 - ra1; - float cos_c = sin(dec1) * sin(dec2) + cos(dec1) * cos(dec2) * cos(delta_ra); - float c = acos(cos_c); -} + float cos_c = sinf(dec1) * sinf(dec2) + cosf(dec1) * cosf(dec2) * cosf(delta_ra); + if (cos_c > 1.0) { + cos_c = 1.0; + } else if (cos_c < -1.0) { + cos_c = -1.0; + } + float c = acosf(cos_c); -__global__ void calculateHistograms(float* d_ra_real, float * d_decl_real, float* r_ra_sim, float* r_decl_sim, int* dd, int* dr, int* rr, int numD, int numR) { - int index = threadIdx.x + blockIdx.x * blockDim.x; + if (isnan(c) || isinf(c)) { + return 0.0; + } else { + return c * 180 / M_PI; + } - if (index < numD) { - for (int j = 0; j < numD; j++) { - if (j == index) continue; // Skip the case where i == j - int bin = (int)(calculateAngularDistance(d_ra_real[index], d_decl_real[index], d_ra_real[j], d_decl_real[j]) / 0.25); - atomicAdd(&dd[bin], 1); - } +} - for (int j = 0; j < numR; j++) { - int bin = (int)(calculateAngularDistance(d_ra_real[index], d_decl_real[index], r_ra_sim[j], r_decl_sim[j]) / 0.25); - atomicAdd(&dr[bin], 1); - } - } +__global__ void calculateHistograms(float* d_ra_real, float * d_decl_real, float* r_ra_sim, float* r_decl_sim, unsigned int* dd, unsigned int* dr, unsigned int* rr, int maxInputLength) { + long int index = (long int)(threadIdx.x + blockIdx.x * blockDim.x); - if (index < numR) { - for (int j = 0; j < numR; j++) { - if (j == index) continue; // Skip the case where i == j - int bin = (int)(calculateAngularDistance(r_ra_sim[index], r_decl_sim[index], r_ra_sim[j], r_decl_sim[j]) / 0.25); - atomicAdd(&rr[bin], 1); - } + if (index >= (long int)maxInputLength * maxInputLength) { + return; } + + int i = index / maxInputLength; + int j = index % maxInputLength; + + int bin_dd = (int)(calculateAngularDistance(d_ra_real[i], d_decl_real[i], d_ra_real[j],d_decl_real[j]) / 0.25); + atomicAdd(&dd[bin_dd], 1); + + int bin_dr = (int)(calculateAngularDistance(d_ra_real[i], d_decl_real[i], r_ra_sim[j],r_decl_sim[j]) / 0.25); + atomicAdd(&dr[bin_dr], 1); + + int bin_rr = (int)(calculateAngularDistance(r_ra_sim[i], r_decl_sim[i], r_ra_sim[j], r_decl_sim[j]) / 0.25); + atomicAdd(&rr[bin_rr], 1); } @@ -88,10 +94,13 @@ void calculateOmega() { } } -void printResult() { - printf("bin start/deg\t\tomega\t\thist_DD\t\thist_DR\t\thist_RR\n"); +void printResult( FILE *outfil ) { + fprintf( outfil, "bin start/deg\t\tomega\t\thist_DD\t\thist_DR\t\thist_RR\n"); for( int i = 0; i < numBins; ++i ){ - printf("%.3f\t\t%.6f\t\t%u\t\t%u\t\t%u\n", i * binWidth, omega[i], histogramDD[i], histogramDR[i], histogramRR[i] ); + if(histogramDD[i] == 0) { + break; + } + fprintf( outfil, "%.3f\t\t%.6f\t\t%u\t\t%u\t\t%u\n", i * binWidth, omega[i], histogramDD[i], histogramDR[i], histogramRR[i] ); } } @@ -140,11 +149,14 @@ int main(int argc, char *argv[]) cudaMemcpy( decl_real_gm, decl_real, NoofReal*sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpy( ra_sim_gm, ra_sim, NoofSim*sizeof(float), cudaMemcpyHostToDevice ); cudaMemcpy( decl_sim_gm, decl_sim, NoofSim*sizeof(float), cudaMemcpyHostToDevice ); + cudaMemset( histogramDR_gm, 0, numBins*sizeof(unsigned int) ); + cudaMemset( histogramDD_gm, 0, numBins*sizeof(unsigned int) ); + cudaMemset( histogramRR_gm, 0, numBins*sizeof(unsigned int) ); - // run the kernels on the GPU - int threadsInBlock = 256; - int blocksInGrid = ( max( NoofReal, NoofSim ) + threadsInBlock - 1) / threadsInBlock; - calculateHistograms<< blocksInGrid, threadsInBlock >>( ra_real_gm, decl_real_gm, ra_sim_gm, decl_sim_gm, histogramDD_gm, histogramDR_gm, histogramRR_gm, NoofReal, NoofSim ); + // check to see which array of coordinates are longer, use that longer length as range + const int maxInputLength = (NoofReal > NoofSim ? NoofReal : NoofSim); + noofblocks = (int)(( maxInputLength * maxInputLength + threadsperblock - 1) / threadsperblock); + calculateHistograms<<< noofblocks, threadsperblock >>>( ra_real_gm, decl_real_gm, ra_sim_gm, decl_sim_gm, histogramDD_gm, histogramDR_gm, histogramRR_gm, maxInputLength ); // copy the results back to the CPU cudaMemcpy( histogramDD, histogramDD_gm, numBins*sizeof(unsigned int), cudaMemcpyDeviceToHost ); @@ -153,7 +165,18 @@ int main(int argc, char *argv[]) // calculate omega values on the CPU calculateOmega(); - printResult(); + + outfil = fopen(argv[3], "w"); + printResult(outfil); + fclose(outfil); + + cudaFree(ra_real_gm); + cudaFree(decl_real_gm); + cudaFree(ra_sim_gm); + cudaFree(decl_sim_gm); + cudaFree(histogramDD_gm); + cudaFree(histogramDR_gm); + cudaFree(histogramRR_gm); // end timing gettimeofday(&_ttime, &_tzone);