bin/0000755000175600017620000000000011544627144010021 5ustar sjpsjpcuda/0000755000175600017620000000000011544626143010163 5ustar sjpsjpcuda/kernels/0000755000175600017620000000000011544624576011636 5ustar sjpsjpcuda/kernels/exchange/0000755000175600017620000000000011544624737013417 5ustar sjpsjpcuda/kernels/jacld/0000755000175600017620000000000011544624750012705 5ustar sjpsjpcuda/kernels/buts/0000755000175600017620000000000011544624625012606 5ustar sjpsjpcuda/kernels/blts/0000755000175600017620000000000011544624612012571 5ustar sjpsjpcuda/kernels/jacu/0000755000175600017620000000000011544624760012553 5ustar sjpsjpcuda/kernels/rhs/0000755000175600017620000000000011544624775012433 5ustar sjpsjpcuda/kernels/rhs/zeta/0000755000175600017620000000000011544625022013360 5ustar sjpsjpcuda/kernels/rhs/eta/0000755000175600017620000000000011544625014013167 5ustar sjpsjpcuda/kernels/rhs/xi/0000755000175600017620000000000011544625004013035 5ustar sjpsjpheaders/0000755000175600017620000000000011544626777010677 5ustar sjpsjpcuda/device_functions.cuh0000644000175600017620000000311711544626073014217 0ustar sjpsjp/**
 * Maps the logical coordinates [k][j][i][m] to an offset in the flat layout.
 *
 * m varies fastest (5 components per cell), then i over (isiz1 + 4) entries,
 * then j over (isiz2 + 4) entries, then k.
 */
__device__ int flat_index(int k, int j, int i, int m)
{
    const int row = k * (isiz2 + 4) + j;
    const int cell = row * (isiz1 + 4) + i;
    return cell * 5 + m;
}

/**
 * Maps the logical coordinates [k][j][i][m] to an offset in the hyperplane
 * (wavefront) layout.
 *
 * k is split into a k-block number (k / kblock) and a depth inside that
 * k-block; the wavefront of a cell inside its k-block is i + j + depth.
 *
 * wave_offset_2d, wave_offset_3d and thread_map_d are lookup tables that are
 * only read here. NOTE(review): presumably the device copies of the tables
 * built by prepare_lookup_tables() -- confirm against the callers.
 */
__device__ int hyperplane_index(int k, int j, int i, int m, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // Split k into (k-block, depth within the k-block).
    const int block_depth = k / kblock;
    const int depth = k - (kblock * block_depth);
    const int wave = i + j + depth;

    // Position of column (j, i) as recorded in the thread map.
    int offset = thread_map_d[(j * (isiz1 + 4)) + i];

    // Skip every preceding k-block.
    offset += block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock);

    // Skip to the start of this cell's wavefront.
    offset += wave_offset_3d[wave];

    // From wavefront (kblock - 1) onwards the matching 2D offset is removed.
    if (wave >= kblock - 1) {
        offset -= wave_offset_2d[wave - (kblock - 1)];
    }

    // Component m is displaced by m * problem_height * (isiz2 + 4) * (isiz1 + 4).
    offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4));

    return offset;
}

/**
 * Maps the logical coordinates [k][j][i][m] to an offset in the tiled layout.
 *
 * The (j, i) plane is cut into tiles of rhsblock_x by rhsblock_y cells, with
 * rhsgrid_x tiles per row of tiles. Inside a tile the i index varies fastest,
 * then j, then k; each component m is displaced by the size of all tiles.
 */
__device__ int tiled_index(int k, int j, int i, int m)
{
    // Tile containing (j, i) and the position of (j, i) inside that tile.
    const int tile_i = (i / rhsblock_x);
    const int tile_j = (j / rhsblock_y);
    const int local_i = i - (tile_i * rhsblock_x);
    const int local_j = j - (tile_j * rhsblock_y);
    const int tile_id = (tile_j * rhsgrid_x) + tile_i;

    // Start of the tile.
    int offset = tile_id * (rhsblock_x * rhsblock_y * isiz3);

    // Position within one k-slice of the tile.
    offset += (local_j * rhsblock_x) + local_i;

    // k-slice within the tile.
    offset += k * (rhsblock_x * rhsblock_y);

    // Component m.
    offset += m * (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3);

    return offset;
}
cuda/kernels.cuh0000644000175600017620000000523711544625075012341 0ustar sjpsjp#include "size.h"
#include "debug.cuh"

/***
 * CONSTANTS *
 ***/
// Device-side constants; the host fills them in with cudaMemcpyToSymbol.
__constant__ int ist_d, iend_d, jst_d, jend_d;
__constant__ int nx_d, ny_d, nz_d;
__constant__ fp_type tx1_d, tx2_d, tx3_d, ty1_d, ty2_d, ty3_d, tz1_d, tz2_d, tz3_d;
__constant__ fp_type dx1_d, dx2_d, dx3_d, dx4_d, dx5_d, dy1_d, dy2_d, dy3_d, dy4_d, dy5_d, dz1_d, dz2_d, dz3_d, dz4_d, dz5_d;
__constant__ fp_type omega_d;
__constant__ fp_type dt_d;
__constant__ fp_type tmp_d;
__constant__ fp_type dssp_d;

/***
 * DEVICE FUNCTIONS *
 ***/
#include "device_functions.cuh"

/***
 * OTHER KERNELS *
 ***/
// Simple kernels to replace SSOR iteration and variable update.
#include "kernels/preprocessing.cuh"
#include "kernels/postprocessing.cuh"

// jacld and jacu kernels
#include "kernels/jacld/jacld_a.cuh"
#include "kernels/jacld/jacld_b.cuh"
#include "kernels/jacld/jacld_c.cuh"
#include "kernels/jacld/jacld_d.cuh"
#include "kernels/jacu/jacu_a.cuh"
#include "kernels/jacu/jacu_b.cuh"
#include "kernels/jacu/jacu_c.cuh"
#include "kernels/jacu/jacu_d.cuh"

// blts and buts kernels.
#include "kernels/blts/blts.cuh"
#include "kernels/buts/buts.cuh"

// Kernels for memory rearrangement.
#include "kernels/data_rearrangement.cuh"
#include "kernels/exchange/exchange_1_recv.cuh"
#include "kernels/exchange/exchange_1_send.cuh"
#include "kernels/exchange/exchange_3_recv.cuh"
#include "kernels/exchange/exchange_3_send.cuh"

// Kernels for rhs.
#include "kernels/rhs/rhs_setup.cuh"

// Kernels for xi-direction rhs.
#include "kernels/rhs/xi/rhs_xi1.cuh"
#include "kernels/rhs/xi/rhs_xi2.cuh"
#include "kernels/rhs/xi/rhs_xi3.cuh"
#include "kernels/rhs/xi/rhs_xi4.cuh"
#include "kernels/rhs/xi/rhs_xi_dissipation.cuh"

// Kernels for eta-direction rhs.
#include "kernels/rhs/eta/rhs_eta1.cuh"
#include "kernels/rhs/eta/rhs_eta2.cuh"
#include "kernels/rhs/eta/rhs_eta3.cuh"
#include "kernels/rhs/eta/rhs_eta4.cuh"
#include "kernels/rhs/eta/rhs_eta_dissipation.cuh"

// Kernels for zeta-direction rhs.
#include "kernels/rhs/zeta/rhs_zeta1.cuh"
#include "kernels/rhs/zeta/rhs_zeta2.cuh"
#include "kernels/rhs/zeta/rhs_zeta3.cuh"
#include "kernels/rhs/zeta/rhs_zeta4.cuh"
#include "kernels/rhs/zeta/rhs_zeta_dissipation.cuh"

// Kernel for l2norm.
#include "kernels/l2norm.cuh"
cuda/util_cuda.cu0000644000175600017620000001756611522517235012501 0ustar sjpsjp#include "applu_cuda.h"

/**
 * Allocates every device buffer referenced by `buffers` and zero-fills it.
 *
 * All cudaMalloc calls are issued first and checked once with
 * CUT_CHECK_ERROR; all cudaMemset calls follow and are checked the same way.
 * The four solution arrays and the rearrangement buffer are each
 * max_buffer_size bytes.
 */
void allocate_buffers(luBuffers* buffers)
{
    // Byte counts used by both the allocation pass and the zero-fill pass.
    const size_t sum_bytes    = 5 * sizeof(fp_type);
    const size_t plane_bytes  = (isiz2 + 4) * (isiz1 + 4) * sizeof(int);
    const size_t wave2d_bytes = ((isiz2 + 4) + (isiz1 + 4)) * sizeof(int);
    const size_t wave3d_bytes = ((isiz2 + 4) + (isiz1 + 4) + kblock - 1) * sizeof(int);
    const size_t ibuf_bytes   = kblock * 5 * (iend - ist + 1) * sizeof(fp_type);
    const size_t jbuf_bytes   = kblock * 5 * (jend - jst + 1) * sizeof(fp_type);
    const size_t ex3_bytes    = 10 * isiz3 * isiz2 * sizeof(fp_type);

    //
    // Allocation pass.
    //

    // Solution arrays.
    cudaMalloc((void **) &buffers->u, max_buffer_size);
    cudaMalloc((void **) &buffers->rsd, max_buffer_size);
    cudaMalloc((void **) &buffers->flux, max_buffer_size);
    cudaMalloc((void **) &buffers->frct, max_buffer_size);

    // Reduction array (one entry per equation).
    cudaMalloc((void **) &buffers->sum, sum_bytes);

    // Scratch target for the layout permutations.
    cudaMalloc((void **) &buffers->rearrangement, max_buffer_size);

    // Lookup tables.
    cudaMalloc((void **) &buffers->columns, plane_bytes);
    cudaMalloc((void **) &buffers->rows, plane_bytes);
    cudaMalloc((void **) &buffers->wave2d_offsets, wave2d_bytes);
    cudaMalloc((void **) &buffers->wave3d_offsets, wave3d_bytes);
    cudaMalloc((void **) &buffers->thread_map, plane_bytes);

    // Ex1
    cudaMalloc((void **) &buffers->ibuffer, ibuf_bytes);
    cudaMalloc((void **) &buffers->jbuffer, jbuf_bytes);

    // Ex3
    cudaMalloc((void **) &buffers->buf, ex3_bytes);
    cudaMalloc((void **) &buffers->buf1, ex3_bytes);

    CUT_CHECK_ERROR("Error in a cudaMalloc: ");

    //
    // Zero-fill pass.
    //

    // Solution arrays.
    cudaMemset(buffers->u, 0, max_buffer_size);
    cudaMemset(buffers->rsd, 0, max_buffer_size);
    cudaMemset(buffers->flux, 0, max_buffer_size);
    cudaMemset(buffers->frct, 0, max_buffer_size);

    // Reduction array.
    cudaMemset(buffers->sum, 0, sum_bytes);

    // Rearrangement buffer.
    cudaMemset(buffers->rearrangement, 0, max_buffer_size);

    // Lookup tables.
    cudaMemset(buffers->columns, 0, plane_bytes);
    cudaMemset(buffers->rows, 0, plane_bytes);
    cudaMemset(buffers->wave2d_offsets, 0, wave2d_bytes);
    cudaMemset(buffers->wave3d_offsets, 0, wave3d_bytes);
    cudaMemset(buffers->thread_map, 0, plane_bytes);

    // Ex1
    cudaMemset(buffers->ibuffer, 0, ibuf_bytes);
    cudaMemset(buffers->jbuffer, 0, jbuf_bytes);

    // Ex3
    cudaMemset(buffers->buf, 0, ex3_bytes);
    cudaMemset(buffers->buf1, 0, ex3_bytes);

    CUT_CHECK_ERROR("Error in a cudaMemset: ");
}

/**
 * Releases every device buffer obtained by allocate_buffers().
 */
void free_buffers(luBuffers* buffers)
{
    // Solution arrays.
    cudaFree(buffers->u);
    cudaFree(buffers->rsd);
    cudaFree(buffers->frct);
    cudaFree(buffers->flux);

    // Reduction and rearrangement buffers.
    cudaFree(buffers->sum);
    cudaFree(buffers->rearrangement);

    // Lookup tables.
    cudaFree(buffers->columns);
    cudaFree(buffers->rows);
    cudaFree(buffers->wave2d_offsets);
    cudaFree(buffers->wave3d_offsets);
    cudaFree(buffers->thread_map);

    // Exchange buffers.
    cudaFree(buffers->ibuffer);
    cudaFree(buffers->jbuffer);
    cudaFree(buffers->buf);
    cudaFree(buffers->buf1);
}

/**
 * Builds the wavefront lookup tables on the host and copies them to the GPU.
 *
 * wave2d_offsets[w] : number of (j, i) cells on 2D wavefronts before w.
 * wave3d_offsets[w] : number of (k, j, i) cells of one k-block (k < kblock)
 *                     on 3D wavefronts before w.
 * thread_map[j * (isiz1 + 4) + i] : running counter value assigned to cell
 *                     (j, i) when the plane is enumerated by 2D wavefront.
 * columns[c], rows[c] : the i and j of the cell that was given counter c.
 *
 * The host arrays must be large enough for the element counts used in the
 * copies at the end of the function.
 */
void prepare_lookup_tables(int* wave2d_offsets, int* wave3d_offsets, int* columns, int* rows, int* thread_map, luBuffers* buffers)
{
    const int n_wave2d = (isiz1 + 4) + (isiz2 + 4);
    const int n_wave3d = (isiz1 + 4) + (isiz2 + 4) + kblock - 1;

    int wave;
    int count;

    // 2D offsets: walk each anti-diagonal (i + j == wave) of the plane.
    count = 0;
    for (wave = 0; wave < n_wave2d; wave++) {
        wave2d_offsets[wave] = count;

        // i is the outer walk so that cells are counted from the bottom left;
        // i never needs to go past the wavefront number.
        int i;
        for (i = 0; i < (isiz1 + 4) && i <= wave; i++) {
            const int j = wave - i;
            if (j < (isiz2 + 4)) {
                thread_map[(j * (isiz1 + 4)) + i] = count;
                columns[count] = i;
                rows[count] = j;
                count++;
            }
        }
    }

    // 3D offsets: count the cells with i + j + k == wave inside one k-block.
    count = 0;
    for (wave = 0; wave < n_wave3d; wave++) {
        wave3d_offsets[wave] = count;

        int k;
        for (k = 0; k < kblock && k <= wave; k++) {
            int j;
            for (j = 0; j < (isiz2 + 4) && j + k <= wave; j++) {
                const int i = wave - (j + k);
                if (i < (isiz1 + 4)) {
                    count++;
                }
            }
        }
    }

    // Copy all of the lookup tables to the GPU.
    const size_t plane_bytes = (isiz2 + 4) * (isiz1 + 4) * sizeof(int);
    cudaMemcpy(buffers->wave2d_offsets, wave2d_offsets, ((isiz2 + 4) + (isiz1 + 4)) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(buffers->wave3d_offsets, wave3d_offsets, ((isiz2 + 4) + (isiz1 + 4) + kblock - 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(buffers->columns, columns, plane_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(buffers->rows, rows, plane_bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(buffers->thread_map, thread_map, plane_bytes, cudaMemcpyHostToDevice);
}

/**
 * Print device and application configuration information.
*/
void print_gpu_info()
{
    // The properties reported are those of device 0.
    cudaDeviceProp pdev;
    cudaGetDeviceProperties(&pdev, 0);
    cudaThreadSynchronize();

    printf(" CUDA Information:\n");
    printf(" Version: %d.%d.\n", pdev.major, pdev.minor);
    printf(" Multiprocessors: %d.\n", pdev.multiProcessorCount);
    printf(" Clock Rate: %d.\n", pdev.clockRate);

    // Launch configurations held in the file-level dim3 globals.
    printf(" Wave Grid Size: %d x %d x %d.\n", waveGrid.x, waveGrid.y, waveGrid.z);
    printf(" Wave Block Size: %d x %d x %d.\n", waveBlock.x, waveBlock.y, waveBlock.z);
    printf(" RHS Grid Size: %d x %d x %d.\n", rhsGrid.x, rhsGrid.y, rhsGrid.z);
    printf(" RHS Block Size: %d x %d x %d.\n", rhsBlock.x, rhsBlock.y, rhsBlock.z);
    printf(" k-blocking factor of %d.\n", kblock);
}

/**
 * Move flat to tiled.
 *
 * Launches flat_to_tiled_kernel on rhsGrid x rhsBlock to permute the data in
 * `buffer` into `rearrangement`. `buffer` is not overwritten (the clear and
 * copy-back steps are disabled), so the permuted data lives in
 * `rearrangement` afterwards. `buffers` is not used.
 */
void flat_to_tiled(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers)
{
    // Disabled: clear the rearrangement buffer first.
    //cudaMemset(rearrangement, 0, max_buffer_size);

    // Permute the data in buffer and store it in rearrangement.
    flat_to_tiled_kernel <<< rhsGrid, rhsBlock >>> (buffer, rearrangement);

    // Disabled: copy rearrangement over the original buffer.
    //cudaMemcpy(buffer, rearrangement, max_buffer_size, cudaMemcpyDeviceToDevice);
}

/**
 * Move tiled to flat.
 *
 * Launches tiled_to_flat_kernel on rhsGrid x rhsBlock to permute the data in
 * `buffer` into `rearrangement`. `buffer` is not overwritten (the clear and
 * copy-back steps are disabled). `buffers` is not used.
 */
void tiled_to_flat(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers)
{
    // Disabled: clear the rearrangement buffer first.
    //cudaMemset(rearrangement, 0, max_buffer_size);

    // Permute the data in buffer and store it in rearrangement.
    tiled_to_flat_kernel <<< rhsGrid, rhsBlock >>> (buffer, rearrangement);

    // Disabled: copy rearrangement over the original buffer.
    //cudaMemcpy(buffer, rearrangement, max_buffer_size, cudaMemcpyDeviceToDevice);
}

/**
 * Move tiled to hyperplane.
 *
 * Launches tiled_to_hyperplane_kernel on rearGrid x rearBlock to permute the
 * data in `buffer` into `rearrangement`, passing the device lookup tables
 * held in `buffers`. `buffer` is not overwritten (the clear and copy-back
 * steps are disabled).
 */
void tiled_to_hyperplane(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers)
{
    // Disabled: clear the rearrangement buffer first.
    //cudaMemset(rearrangement, 0, max_buffer_size);

    // Permute the data in buffer and store it in rearrangement.
    tiled_to_hyperplane_kernel <<< rearGrid, rearBlock >>> (buffer, rearrangement, buffers->wave2d_offsets, buffers->wave3d_offsets, buffers->thread_map);

    // Disabled: copy rearrangement over the original buffer.
    //cudaMemcpy(buffer, rearrangement, max_buffer_size, cudaMemcpyDeviceToDevice);
}

/**
 * Move hyperplane to tiled.
 *
 * Launches hyperplane_to_tiled_kernel on rearGrid x rearBlock to permute the
 * data in `buffer` into `rearrangement`, passing the device lookup tables
 * held in `buffers`. `buffer` is not overwritten (the clear and copy-back
 * steps are disabled).
 */
void hyperplane_to_tiled(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers)
{
    // Disabled: clear the rearrangement buffer first.
    //cudaMemset(rearrangement, 0, max_buffer_size);

    // Permute the data in buffer and store it in rearrangement.
    hyperplane_to_tiled_kernel <<< rearGrid, rearBlock >>> (buffer, rearrangement, buffers->wave2d_offsets, buffers->wave3d_offsets, buffers->thread_map);

    // Disabled: copy rearrangement over the original buffer.
    //cudaMemcpy(buffer, rearrangement, max_buffer_size, cudaMemcpyDeviceToDevice);
}
cuda/ssor_cuda.cu0000644000175600017620000003144511536140765012507 0ustar sjpsjp#include <stdio.h>
#include <stdlib.h>
// NOTE(review): the two system-header names above were missing from this copy
// (the directives read as a bare "#include"). <stdio.h> and <stdlib.h> are
// chosen because this file calls printf and malloc/free -- confirm against
// the pristine source. <string.h> is added for memset.
#include <string.h>
#include "mpinpb.h"
#include "applu.h"
#include "kernels.cuh"

// Need to tell the compiler this is C code.
extern "C" {

// Include some headers.
#include "applu_cuda.h"
#include "timers.h"

/**
 * Global variables for this class; shared between all GPU wrapper functions.
 */
dim3 waveGrid, waveBlock;   // launch configuration for the wavefront methods
dim3 rhsGrid, rhsBlock;     // launch configuration for the embarrassingly parallel methods
dim3 rearGrid, rearBlock;   // launch configuration for the rearrangement step
int max_buffer_size;        // size in bytes of the largest device array

// Include the other GPU functions.
#include "util_cuda.cu"
#define exblock 64
#include "exchange_1.cu"
#include "exchange_3.cu"
#include "preprocessing.cu"
#include "postprocessing.cu"
#include "blts.cu"
#include "buts.cu"
#include "rhs.cu"
#include "l2norm.cu"

/**
 * Exchanges the two pointers *x and *y.
 *
 * Used after each layout permutation so that the named buffer refers to the
 * freshly permuted data and the old storage becomes the scratch buffer.
 */
inline void swap_pointers(fp_type** x, fp_type** y)
{
    fp_type* tmp = *x;
    *x = *y;
    *y = tmp;
}

/**
 * Performs pseudo-time stepping SSOR iterations for given nonlinear pde's.
 * Carries out work (jacld, blts, jacu, buts) on CUDA-capable GPUs.
 */
void ssor_gpu(int niter)
{
    /**
     * Local variables.
*/
    int k;
    int istep;
    fp_type tmp;
    fp_type delunm[5];

    root = 0;

    // Block and grid for wavefront methods.
    waveGrid.x = gpugrid_x;
    waveGrid.y = 1;
    waveGrid.z = 1;
    waveBlock.x = gpublock_x;
    waveBlock.y = 1;
    waveBlock.z = 1;

    // Block and grid for embarrassingly parallel methods.
    rhsGrid.x = rhsgrid_x;
    rhsGrid.y = rhsgrid_y;
    rhsGrid.z = 1;
    rhsBlock.x = rhsblock_x;
    rhsBlock.y = rhsblock_y;
    rhsBlock.z = 1;

    // Block and grid for rearrangement step.
    rearGrid.x = rhsgrid_x;
    rearGrid.y = rhsgrid_y * isiz3;
    rearGrid.z = 1;
    rearBlock.x = rhsblock_x;
    rearBlock.y = rhsblock_y;
    rearBlock.z = 1;

    // Calculate the size of the biggest array we need: every solution array
    // must be able to hold the flat, hyperplane and tiled layouts.
    // NOTE(review): these byte counts are narrowed to int (max_buffer_size is
    // an int); confirm they stay below INT_MAX for the largest problem class.
    int flat_size = isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type);
    int hyper_size = problem_height * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type);
    int tiled_size = problem_height * (rhsblock_x * rhsgrid_x) * (rhsblock_y * rhsgrid_y) * 5 * sizeof(fp_type);
    max_buffer_size = (flat_size > hyper_size) ? flat_size : hyper_size;
    if (tiled_size > max_buffer_size) {
        max_buffer_size = tiled_size;
    }
    printf(" Buffer Size = %d\n", max_buffer_size);

    // Tell all functions that we are more interested in cache than smem.
    cudaFuncSetCacheConfig(blts_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(buts_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_setup_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_xi2_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_xi4_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_eta2_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_eta4_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_zeta1_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_zeta2_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_zeta3_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_zeta4_kernel, cudaFuncCachePreferL1);
    cudaFuncSetCacheConfig(rhs_zeta_dissipation_kernel, cudaFuncCachePreferL1);

    // Get some GPU information in the first iteration.
    if (niter == 1 && id == 0) {
        print_gpu_info();
    }

    // Copy across constants. The __constant__ symbols are passed directly
    // rather than by name string: lookup by string is not supported by the
    // CUDA runtime from version 5.0 onwards, while the direct form works on
    // both old and new toolkits.
    cudaMemcpyToSymbol(ist_d, &ist, sizeof(int));
    cudaMemcpyToSymbol(iend_d, &iend, sizeof(int));
    cudaMemcpyToSymbol(jst_d, &jst, sizeof(int));
    cudaMemcpyToSymbol(jend_d, &jend, sizeof(int));
    cudaMemcpyToSymbol(nx_d, &nx, sizeof(int));
    cudaMemcpyToSymbol(ny_d, &ny, sizeof(int));
    cudaMemcpyToSymbol(nz_d, &nz, sizeof(int));
    cudaMemcpyToSymbol(dt_d, &dt, sizeof(fp_type));
    cudaMemcpyToSymbol(tx1_d, &tx1, sizeof(fp_type));
    cudaMemcpyToSymbol(tx2_d, &tx2, sizeof(fp_type));
    cudaMemcpyToSymbol(tx3_d, &tx3, sizeof(fp_type));
    cudaMemcpyToSymbol(ty1_d, &ty1, sizeof(fp_type));
    cudaMemcpyToSymbol(ty2_d, &ty2, sizeof(fp_type));
    cudaMemcpyToSymbol(ty3_d, &ty3, sizeof(fp_type));
    cudaMemcpyToSymbol(tz1_d, &tz1, sizeof(fp_type));
    cudaMemcpyToSymbol(tz2_d, &tz2, sizeof(fp_type));
    cudaMemcpyToSymbol(tz3_d, &tz3, sizeof(fp_type));
    cudaMemcpyToSymbol(dx1_d, &dx1, sizeof(fp_type));
    cudaMemcpyToSymbol(dx2_d, &dx2, sizeof(fp_type));
    cudaMemcpyToSymbol(dx3_d, &dx3, sizeof(fp_type));
    cudaMemcpyToSymbol(dx4_d, &dx4, sizeof(fp_type));
    cudaMemcpyToSymbol(dx5_d, &dx5, sizeof(fp_type));
    cudaMemcpyToSymbol(dy1_d, &dy1, sizeof(fp_type));
    cudaMemcpyToSymbol(dy2_d, &dy2, sizeof(fp_type));
    cudaMemcpyToSymbol(dy3_d, &dy3, sizeof(fp_type));
    cudaMemcpyToSymbol(dy4_d, &dy4, sizeof(fp_type));
    cudaMemcpyToSymbol(dy5_d, &dy5, sizeof(fp_type));
    cudaMemcpyToSymbol(dz1_d, &dz1, sizeof(fp_type));
    cudaMemcpyToSymbol(dz2_d, &dz2, sizeof(fp_type));
    cudaMemcpyToSymbol(dz3_d, &dz3, sizeof(fp_type));
    cudaMemcpyToSymbol(dz4_d, &dz4, sizeof(fp_type));
    cudaMemcpyToSymbol(dz5_d, &dz5, sizeof(fp_type));
    cudaMemcpyToSymbol(omega_d, &omega, sizeof(fp_type));
    cudaMemcpyToSymbol(dssp_d, &dssp, sizeof(fp_type));
    CUT_CHECK_ERROR("Error in a cudaMemcpyToSymbol.\n");

    // Allocate all of the buffers.
    luBuffers buffers;
    allocate_buffers(&buffers);

    // Compute wavefront offsets etc. on the host; prepare_lookup_tables()
    // fills the tables and uploads them to the device.
    int* wave2d_offsets = (int*) malloc( ((isiz2 + 4) + (isiz1 + 4)) * sizeof(int) );
    int* wave3d_offsets = (int*) malloc( ((isiz2 + 4) + (isiz1 + 4) + kblock - 1) * sizeof(int) );
    int* thread_map = (int*) malloc( ((isiz2 + 4) * (isiz1 + 4)) * sizeof(int) );
    int* columns = (int*) malloc ( ((isiz2 + 4) * (isiz1 + 4)) * sizeof(int) );
    int* rows = (int*) malloc ( ((isiz2 + 4) * (isiz1 + 4)) * sizeof(int) );
    memset(wave2d_offsets, 0, ((isiz2 + 4) + (isiz1 + 4)) * sizeof(int));
    memset(wave3d_offsets, 0, ((isiz2 + 4) + (isiz1 + 4) + kblock - 1) * sizeof(int));
    memset(thread_map, 0, ((isiz2 + 4) * (isiz1 + 4)) * sizeof(int));
    memset(columns, 0, ((isiz2 + 4) * (isiz1 + 4)) * sizeof(int));
    memset(rows, 0, ((isiz2 + 4) * (isiz1 + 4)) * sizeof(int));
    prepare_lookup_tables(wave2d_offsets, wave3d_offsets, columns, rows, thread_map, &buffers);

    // Begin pseudo-time stepping iterations.
    const fp_type fpone = 1.0e+00;
    const fp_type fptwo = 2.0e+00;
    tmp = fpone / ( omega * ( fptwo - omega ) );
    cudaMemcpyToSymbol(tmp_d, &tmp, sizeof(fp_type));
    CUT_CHECK_ERROR("Error in cudaMemcpyToSymbol for tmp.\n");

    // Put all the data on the card.
    cudaMemcpy(buffers.u, u_flat, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyHostToDevice);
    cudaMemcpy(buffers.rsd, rsd_flat, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyHostToDevice);
    cudaMemcpy(buffers.frct, frct_flat, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyHostToDevice);
    cudaMemcpy(buffers.flux, flux_flat, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyHostToDevice);

    // Make the data tiled. Each permutation writes into the rearrangement
    // buffer, which is then swapped with the array it was produced from.
    flat_to_tiled(buffers.u, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.u, &buffers.rearrangement);
    flat_to_tiled(buffers.rsd, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.rsd, &buffers.rearrangement);
    flat_to_tiled(buffers.frct, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.frct, &buffers.rearrangement);
    flat_to_tiled(buffers.flux, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.flux, &buffers.rearrangement);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Error in flat_to_tiled.\n");

    // Compute the steady-state residuals.
    rhs_gpu(&buffers);

    // Compute the L2 norms of newton iteration residuals.
    l2norm_gpu(rsdnm, &buffers);

    MPI_Barrier(MPI_COMM_WORLD);

    // Make sure all aggregate timers are initialised.
    timer_reset();

    // Start the timer for the ssor loop.
    cudaThreadSynchronize();
    timer_start(0);

    // Start the timestep loop - do for niter times.
    for (istep = 1; istep <= niter; istep++) {

        // Progress message from rank 0 on the first step, every 20th step
        // and step itmax (only when more than one step was requested).
        if (id == 0 && niter > 1 && (istep % 20 == 0 || istep == itmax || istep == 1)) {
            printf(" Pseudo-time SSOR iteration no. = %d.\n", istep);
        }

        // Perform SSOR iteration.
        preprocessing_gpu(&buffers);

        timer_start(9);
        // Move to the hyperplane layout.
        tiled_to_hyperplane(buffers.u, buffers.rearrangement, &buffers);
        swap_pointers(&buffers.u, &buffers.rearrangement);
        tiled_to_hyperplane(buffers.rsd, buffers.rearrangement, &buffers);
        swap_pointers(&buffers.rsd, &buffers.rearrangement);
        CUT_CHECK_ERROR(" Error in tiled_to_hyperplane.\n");
        cudaThreadSynchronize();
        timer_stop(9);

        // For each tile in k-dimension (from bottom to top).
        // Perform the lower triangular solution.
        for (k = 0; k < nz; k += kblock) {
            // Receive data from north and west.
            exchange_1_gpu(k, 0, &buffers);

            timer_start(3);
            blts_gpu(k, &buffers);
            timer_stop(3);

            // Send data south and east.
            exchange_1_gpu(k + kblock - 1, 2, &buffers);
        }

        // For each tile in k-dimension (from top to bottom).
        // Perform the upper triangular solution.
        for (k = problem_height - 1; k >= 0; k -= kblock) {
            // Receive data from south and east.
            exchange_1_gpu(k, 1, &buffers);

            timer_start(5);
            buts_gpu(k, &buffers);
            timer_stop(5);

            // Mirror of the send after blts (mode 3); presumably sends data
            // north and west -- confirm against exchange_1_gpu.
            exchange_1_gpu(k - kblock + 1, 3, &buffers);
        }

        timer_start(9);
        // Move to the tiled memory layout.
        hyperplane_to_tiled(buffers.u, buffers.rearrangement, &buffers);
        swap_pointers(&buffers.u, &buffers.rearrangement);
        hyperplane_to_tiled(buffers.rsd, buffers.rearrangement, &buffers);
        swap_pointers(&buffers.rsd, &buffers.rearrangement);
        CUT_CHECK_ERROR(" Error in hyperplane_to_tiled.\n");
        cudaThreadSynchronize();
        timer_stop(9);

        // Update the variables.
        postprocessing_gpu(&buffers);

        // Compute the norms of the newton iteration corrections.
        if (istep % inorm == 0) {
            l2norm_gpu(delunm, &buffers);
            if (ipr == 1 && id == 0) {
                printf(" RMS-norm of SSOR-iteration correction for first pde = %e.\n", delunm[0]);
                printf(" RMS-norm of SSOR-iteration correction for second pde = %e.\n", delunm[1]);
                printf(" RMS-norm of SSOR-iteration correction for third pde = %e.\n", delunm[2]);
                printf(" RMS-norm of SSOR-iteration correction for fourth pde = %e.\n", delunm[3]);
                printf(" RMS-norm of SSOR-iteration correction for fifth pde = %e.\n", delunm[4]);
            } else if (ipr == 2 && id == 0) {
                printf(" %d, %f.\n", istep, delunm[4]);
            }
        }

        // Compute the steady-state residuals.
        rhs_gpu(&buffers);

        // Compute the norms of the newton iteration residuals.
        if (istep % inorm == 0 || istep == itmax ) {
            l2norm_gpu(rsdnm, &buffers);
            if (ipr == 1 && id == 0) {
                printf(" RMS-norm of steady state residual for first pde = %e.\n", rsdnm[0]);
                printf(" RMS-norm of steady state residual for second pde = %e.\n", rsdnm[1]);
                printf(" RMS-norm of steady state residual for third pde = %e.\n", rsdnm[2]);
                printf(" RMS-norm of steady state residual for fourth pde = %e.\n", rsdnm[3]);
                printf(" RMS-norm of steady state residual for fifth pde = %e.\n", rsdnm[4]);
            }
        }

        // Check the newton-iteration residuals against the tolerance levels.
        if (rsdnm[0] < tolrsd[0] && rsdnm[1] < tolrsd[1] && rsdnm[2] < tolrsd[2] && rsdnm[3] < tolrsd[3] && rsdnm[4] < tolrsd[4]) {
            if (ipr == 1 && id == 0) {
                printf(" Convergence was achieved after %d pseudo-time steps.\n", istep);
            }
            // Every rank leaves the loop on convergence, not just the rank
            // that prints. Leaving with break (instead of returning) lets the
            // code below stop the timer, copy the results back to the host
            // and release the host and device buffers.
            break;
        }
    }

    // End the timers.
    cudaThreadSynchronize();
    timer_stop(0);

    // Pull back all data that might be necessary: return to the flat layout
    // and copy the four solution arrays to the host.
    tiled_to_flat(buffers.u, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.u, &buffers.rearrangement);
    tiled_to_flat(buffers.rsd, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.rsd, &buffers.rearrangement);
    tiled_to_flat(buffers.frct, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.frct, &buffers.rearrangement);
    tiled_to_flat(buffers.flux, buffers.rearrangement, &buffers);
    swap_pointers(&buffers.flux, &buffers.rearrangement);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR(" Error in tiled_to_flat.\n");

    cudaMemcpy(u_flat, buffers.u, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyDeviceToHost);
    cudaMemcpy(rsd_flat, buffers.rsd, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyDeviceToHost);
    cudaMemcpy(frct_flat, buffers.frct, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyDeviceToHost);
    cudaMemcpy(flux_flat, buffers.flux, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type), cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();

    /**
     * C cleanup.
     */
    free(wave2d_offsets);
    free(wave3d_offsets);
    free(thread_map);
    free(columns);
    free(rows);
    free_buffers(&buffers);
}

}
cuda/rhs.cu0000644000175600017620000001165311477410615011316 0ustar sjpsjp/**
 * Wrapper function for rhs step.
 *
 * Launches the rhs kernels on rearGrid x rearBlock. Timer 8 is paused around
 * the two exchange_3_gpu calls, so it does not include them.
 */
void rhs_gpu(luBuffers* buffers)
{
    // Rhs setup.
    timer_start(8);
    rhs_setup_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->frct);
    cudaThreadSynchronize();
    timer_pause(8);

    // Exchange borders of u with neighbouring processes.
    exchange_3_gpu(0, buffers);
    cudaThreadSynchronize();
    timer_restart(8);

    // Xi-direction buffers->flux differences.
    // Horrible if statements due to templates on north and south.
if (north == -1) {
        if (south == -1) {
            // No neighbour on either side: <-1, -1>.
            rhs_xi1_kernel<-1, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_xi3_kernel<-1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_xi_dissipation_kernel<-1, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        } else {
            // Southern neighbour only: <-1, 0>.
            rhs_xi1_kernel<-1, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_xi3_kernel<0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_xi_dissipation_kernel<-1, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        }
    } else {
        if (south == -1) {
            // Northern neighbour only: <0, -1>.
            rhs_xi1_kernel<0, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_xi3_kernel<-1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_xi_dissipation_kernel<0, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        } else {
            // Neighbours on both sides: <0, 0>.
            rhs_xi1_kernel<0, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_xi3_kernel<0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_xi4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_xi_dissipation_kernel<0, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        }
    }
    cudaThreadSynchronize();
    timer_pause(8);

    // Exchange borders of u with neighbouring processes.
    exchange_3_gpu(1, buffers);
    cudaThreadSynchronize();
    timer_restart(8);

    // Eta-direction buffers->flux differences.
    // The kernels are templated on west and east, hence one branch per
    // combination of "neighbour missing" (-1) and "neighbour present" (0).
    if (west == -1) {
        if (east == -1) {
            // No neighbour on either side: <-1, -1>.
            rhs_eta1_kernel<-1, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_eta3_kernel<-1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_eta_dissipation_kernel<-1, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        } else {
            // Eastern neighbour only: <-1, 0>.
            rhs_eta1_kernel<-1, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_eta3_kernel<0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_eta_dissipation_kernel<-1, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        }
    } else {
        if (east == -1) {
            // Western neighbour only: <0, -1>.
            rhs_eta1_kernel<0, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_eta3_kernel<-1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_eta_dissipation_kernel<0, -1> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        } else {
            // Neighbours on both sides: <0, 0>.
            rhs_eta1_kernel<0, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
            rhs_eta3_kernel<0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
            rhs_eta4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
            rhs_eta_dissipation_kernel<0, 0> <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
        }
    }

    // Zeta-direction buffers->flux differences (no template variants).
    rhs_zeta1_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
    rhs_zeta2_kernel <<< rearGrid, rearBlock >>> (buffers->rsd, buffers->flux);
    rhs_zeta3_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->flux);
    rhs_zeta4_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd, buffers->flux);
    rhs_zeta_dissipation_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Error in rhs.\n");

    cudaThreadSynchronize();
    timer_stop(8);
}
cuda/preprocessing.cu0000644000175600017620000000050011477406151013372 0ustar sjpsjp/**
 * Wrapper function for preprocessing step.
 * Performs the SSOR iteration itself.
 *
 * Launches preprocessing_kernel on rearGrid x rearBlock over buffers->rsd,
 * timed with timer 1.
 */
void preprocessing_gpu(luBuffers* buffers)
{
    timer_start(1);

    preprocessing_kernel <<< rearGrid, rearBlock >>> (buffers->rsd);
    CUT_CHECK_ERROR("Error in preprocessing.\n");

    // Wait for the kernel so that the timer covers its execution.
    cudaThreadSynchronize();
    timer_stop(1);
}
cuda/postprocessing.cu0000644000175600017620000000050111477406244013575 0ustar sjpsjp/**
 * Wrapper function for postprocessing step.
 * Updates the variables.
 *
 * Launches postprocessing_kernel on rearGrid x rearBlock over buffers->u and
 * buffers->rsd, timed with timer 6.
 */
void postprocessing_gpu(luBuffers* buffers)
{
    timer_start(6);

    postprocessing_kernel <<< rearGrid, rearBlock >>> (buffers->u, buffers->rsd);
    CUT_CHECK_ERROR("Error in postprocessing.\n");

    // Wait for the kernel so that the timer covers its execution.
    cudaThreadSynchronize();
    timer_stop(6);
}
cuda/l2norm.cu0000644000175600017620000000122011477407656011733 0ustar sjpsjp/**
 * Wrapper function for l2norm on the gpu.
*/
void l2norm_gpu(fp_type* sum, luBuffers* buffers)
{
    timer_start(7);

    // Clear the host accumulators and mirror them to the device.
    int m = 0;
    for (m = 0; m < 5; m++) {
        sum[m] = 0.0;
    }
    cudaMemcpy(buffers->sum, sum, 5 * sizeof(fp_type), cudaMemcpyHostToDevice);

    // One single-thread block per component computes the local sum of squares.
    l2norm_kernel <<< 5, 1 >>> (buffers->rsd, buffers->sum);
    cudaMemcpy(sum, buffers->sum, 5 * sizeof(fp_type), cudaMemcpyDeviceToHost);

    // Combine the partial sums of all processes.
    MPI_Allreduce(MPI_IN_PLACE, sum, 5, MPI_FP_TYPE, MPI_SUM, MPI_COMM_WORLD);

    // Normalise by the (nx0-2)*(ny0-2)*(nz0-2) point count and take the root.
    for (m = 0; m < 5; m++) {
        sum[m] = sqrt ( sum[m] / ( (nx0-2)*(ny0-2)*(nz0-2) ) );
    }

    CUT_CHECK_ERROR("Error in l2norm.\n");
    cudaThreadSynchronize();
    timer_stop(7);
}
cuda/exchange_3.cu0000644000175600017620000001265411477410702012525 0ustar sjpsjp
/** Signature shared by all ex3 pack / unpack kernels: (buffer, u). */
typedef void (*ex3_kernel_t)(fp_type*, fp_type*);

/**
 * Packs two boundary planes of u on the GPU, copies them into buf_flat
 * and sends them to rank dest with a blocking MPI_Send.
 *
 * pack    kernel that fills buffers->buf.
 * extent  number of cells along the face (ny or nx); sizes grid.x.
 * count   number of fp_type values in the message (10 * extent * nz).
 * dest    destination rank.
 * tag     MPI tag of the message.
 *
 * Only count values are copied back from the device. That is the range
 * the pack kernels write (two planes of extent * nz cells with 5 values
 * each) and exactly what MPI_Send transmits.
 * NOTE(review): assumes the device constants ny_d / nx_d / nz_d mirror
 * the host ny / nx / nz -- confirm.
 */
static void ex3_pack_and_send(ex3_kernel_t pack, int extent, int count, int dest, int tag, luBuffers* buffers)
{
    dim3 packGrid, packBlock;

    timer_start(11);

    // Pack.
    packGrid.x = ceil( extent / (fp_type) exblock );
    packGrid.y = nz;
    packGrid.z = 1;
    packBlock.x = exblock;
    packBlock.y = 1;
    packBlock.z = 1;
    pack <<< packGrid, packBlock >>> (buffers->buf, buffers->u);

    // Get buffer from the GPU (blocking copy, so the kernel has finished).
    cudaMemcpy(buf_flat, buffers->buf, count * sizeof(fp_type), cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();
    timer_stop(11);

    // Send.
    MPI_Send(buf_flat, count, MPI_FP_TYPE, dest, tag, MPI_COMM_WORLD);
}

/**
 * Waits for a receive posted into buf1_flat, copies the message to the
 * GPU and unpacks it into the border cells of u.
 *
 * unpack   kernel that reads buffers->buf1.
 * extent   number of cells along the face (ny or nx); sizes grid.x.
 * count    number of fp_type values in the message (10 * extent * nz).
 * request  the pending MPI_Irecv request.
 */
static void ex3_wait_and_unpack(ex3_kernel_t unpack, int extent, int count, MPI_Request* request, luBuffers* buffers)
{
    MPI_Status status;
    dim3 unpackGrid, unpackBlock;

    MPI_Wait(request, &status);
    timer_start(11);

    // Copy the buffer to the GPU; only the received values are needed.
    cudaMemcpy(buffers->buf1, buf1_flat, count * sizeof(fp_type), cudaMemcpyHostToDevice);

    // Unpack.
    unpackGrid.x = ceil( extent / (fp_type) exblock );
    unpackGrid.y = nz;
    unpackGrid.z = 1;
    unpackBlock.x = exblock;
    unpackBlock.y = 1;
    unpackBlock.z = 1;
    unpack <<< unpackGrid, unpackBlock >>> (buffers->buf1, buffers->u);

    cudaThreadSynchronize();
    timer_stop(11);
}

/**
 * Exchanges two boundary planes of u with the neighbouring processes.
 *
 * iex = 0: north/south.
 * iex = 1: east/west.
 *
 * For each axis the receive is posted first, then the opposite face is
 * packed and sent, and finally the received planes are unpacked. A
 * neighbour rank of -1 disables the traffic in that direction.
 */
void exchange_3_gpu(int iex, luBuffers* buffers)
{
    MPI_Request mid;

    if (iex == 0) {
        /**
         * Communicate in the south and north directions.
         */
        const int count = 10*ny*nz;

        if (north != -1) {
            MPI_Irecv(buf1_flat, count, MPI_FP_TYPE, MPI_ANY_SOURCE, from_n, MPI_COMM_WORLD, &mid);
        }
        // Send south.
        if (south != -1) {
            ex3_pack_and_send(ex3_pack_south, ny, count, south, from_n, buffers);
        }
        // Receive from north.
        if (north != -1) {
            ex3_wait_and_unpack(ex3_unpack_north, ny, count, &mid, buffers);
        }

        if (south != -1) {
            MPI_Irecv(buf1_flat, count, MPI_FP_TYPE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &mid);
        }
        // Send north.
        if (north != -1) {
            ex3_pack_and_send(ex3_pack_north, ny, count, north, from_s, buffers);
        }
        // Receive from south.
        if (south != -1) {
            ex3_wait_and_unpack(ex3_unpack_south, ny, count, &mid, buffers);
        }
    } else {
        /**
         * Communicate in the east and west directions.
         */
        const int count = 10*nx*nz;

        if (west != -1) {
            MPI_Irecv(buf1_flat, count, MPI_FP_TYPE, MPI_ANY_SOURCE, from_w, MPI_COMM_WORLD, &mid);
        }
        // Send east.
        if (east != -1) {
            ex3_pack_and_send(ex3_pack_east, nx, count, east, from_w, buffers);
        }
        // Receive from west.
        if (west != -1) {
            ex3_wait_and_unpack(ex3_unpack_west, nx, count, &mid, buffers);
        }

        if (east != -1) {
            MPI_Irecv(buf1_flat, count, MPI_FP_TYPE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &mid);
        }
        // Send west.
        if (west != -1) {
            ex3_pack_and_send(ex3_pack_west, nx, count, west, from_e, buffers);
        }
        // Receive from east.
        if (east != -1) {
            ex3_wait_and_unpack(ex3_unpack_east, nx, count, &mid, buffers);
        }
    }
}
cuda/exchange_1.cu0000644000175600017620000001434511544626143012525 0ustar sjpsjp
/**
 * iex = 0 : Receive north/west.
 * iex = 1 : Receive south/east.
* iex = 2 : Send south/east.
 * iex = 3 : Send north/west.
 */

/** Signature shared by every ex1 pack / unpack kernel. */
typedef void (*ex1_kernel_t)(fp_type*, fp_type*, int, int*, int*, int*);

/**
 * Blocking receive of one boundary strip, followed by its upload to the
 * GPU and the unpack kernel that scatters it into rsd.
 *
 * unpack    kernel to launch on dev_buf.
 * host_buf  host staging buffer the message is received into.
 * dev_buf   device buffer the message is copied to.
 * extent    number of cells along the strip (sizes grid.x and the message).
 * source    rank to receive from.
 * tag       MPI tag of the message.
 * k         forwarded unchanged to the kernel.
 */
static void ex1_recv_and_unpack(ex1_kernel_t unpack, void* host_buf, fp_type* dev_buf, int extent, int source, int tag, int k, luBuffers* buffers)
{
    const int count = kblock * extent * 5;
    MPI_Status status;
    dim3 unpackGrid, unpackBlock;

    MPI_Recv(host_buf, count, MPI_FP_TYPE, source, tag, MPI_COMM_WORLD, &status);

    timer_start(10);

    // Send the buffer to the GPU.
    cudaMemcpy(dev_buf, host_buf, count * sizeof(fp_type), cudaMemcpyHostToDevice);

    // Unpack: grid.y = kblock, grid.x covers the strip in chunks of exblock threads.
    unpackGrid.x = ceil( extent / (fp_type) exblock );
    unpackGrid.y = kblock;
    unpackGrid.z = 1;
    unpackBlock.x = exblock;
    unpackBlock.y = 1;
    unpackBlock.z = 1;
    unpack <<< unpackGrid, unpackBlock >>> (dev_buf, buffers->rsd, k, buffers->wave2d_offsets, buffers->wave3d_offsets, buffers->thread_map);

    cudaThreadSynchronize();
    timer_stop(10);
}

/**
 * Packs one boundary strip of rsd on the GPU, downloads it and sends it
 * with a blocking MPI_Send.
 *
 * pack      kernel that fills dev_buf.
 * host_buf  host staging buffer the strip is copied into and sent from.
 * dev_buf   device buffer the kernel writes.
 * extent    number of cells along the strip (sizes grid.x and the message).
 * dest      rank to send to.
 * tag       MPI tag of the message.
 * k         forwarded unchanged to the kernel.
 */
static void ex1_pack_and_send(ex1_kernel_t pack, void* host_buf, fp_type* dev_buf, int extent, int dest, int tag, int k, luBuffers* buffers)
{
    const int count = kblock * extent * 5;
    dim3 packGrid, packBlock;

    timer_start(10);

    // Pack.
    packGrid.x = ceil( extent / (fp_type) exblock );
    packGrid.y = kblock;
    packGrid.z = 1;
    packBlock.x = exblock;
    packBlock.y = 1;
    packBlock.z = 1;
    pack <<< packGrid, packBlock >>> (dev_buf, buffers->rsd, k, buffers->wave2d_offsets, buffers->wave3d_offsets, buffers->thread_map);

    // Get buffer from the GPU.
    cudaMemcpy(host_buf, dev_buf, count * sizeof(fp_type), cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();
    timer_stop(10);

    MPI_Send(host_buf, count, MPI_FP_TYPE, dest, tag, MPI_COMM_WORLD);
}

/**
 * Exchanges boundary strips of rsd for the k-block starting at k.
 *
 * iex = 0 : Receive north/west.
 * iex = 1 : Receive south/east.
 * iex = 2 : Send south/east.
 * any other value : Send north/west.
 *
 * North/south traffic goes through jbuf_flat / buffers->jbuffer,
 * east/west traffic through ibuf_flat / buffers->ibuffer. A neighbour
 * rank of -1 disables the traffic in that direction.
 */
void exchange_1_gpu(int k, int iex, luBuffers* buffers)
{
    const int j_extent = jend - jst + 1;
    const int i_extent = iend - ist + 1;

    switch (iex) {
    case 0:
        // Receive north/west.
        if (north != -1) {
            ex1_recv_and_unpack(ex1_unpack_north, jbuf_flat, buffers->jbuffer, j_extent, north, from_n, k, buffers);
        }
        if (west != -1) {
            ex1_recv_and_unpack(ex1_unpack_west, ibuf_flat, buffers->ibuffer, i_extent, west, from_w, k, buffers);
        }
        break;

    case 1:
        // Receive south/east.
        if (south != -1) {
            ex1_recv_and_unpack(ex1_unpack_south, jbuf_flat, buffers->jbuffer, j_extent, south, from_s, k, buffers);
        }
        if (east != -1) {
            ex1_recv_and_unpack(ex1_unpack_east, ibuf_flat, buffers->ibuffer, i_extent, east, from_e, k, buffers);
        }
        break;

    case 2:
        // Send south/east.
        if (south != -1) {
            ex1_pack_and_send(ex1_pack_south, jbuf_flat, buffers->jbuffer, j_extent, south, from_n, k, buffers);
        }
        if (east != -1) {
            ex1_pack_and_send(ex1_pack_east, ibuf_flat, buffers->ibuffer, i_extent, east, from_w, k, buffers);
        }
        break;

    default:
        // Send north/west.
        if (north != -1) {
            ex1_pack_and_send(ex1_pack_north, jbuf_flat, buffers->jbuffer, j_extent, north, from_s, k, buffers);
        }
        if (west != -1) {
            ex1_pack_and_send(ex1_pack_west, ibuf_flat, buffers->ibuffer, i_extent, west, from_e, k, buffers);
        }
        break;
    }
}
cuda/buts.cu0000644000175600017620000000075211457010262011465 0ustar sjpsjp
/**
 * Wrapper function to replace buts().
 * Launches buts_kernel once per wavefront, from the highest wave index
 * down to wave 0.
 */
void buts_gpu(int k, luBuffers* buffers)
{
    const int last_wave = (isiz1 + 4) + (isiz2 + 4) + kblock - 3;

    for (int wave = last_wave; wave >= 0; --wave) {
        buts_kernel <<< waveGrid, waveBlock >>> (buffers->rsd, buffers->u, wave, k, buffers->columns, buffers->rows, buffers->wave2d_offsets, buffers->wave3d_offsets, buffers->thread_map);
    }

    CUT_CHECK_ERROR("Error in buts.\n");
    cudaThreadSynchronize();
}
cuda/blts.cu0000644000175600017620000000074711477406103011465 0ustar sjpsjp
/**
 * Wrapper function to replace blts().
 * Launches blts_kernel once per wavefront, from wave 0 upwards.
 */
void blts_gpu(int k, luBuffers* buffers)
{
    const int wave_count = (isiz1 + 4) + (isiz2 + 4) + kblock - 2;

    for (int wave = 0; wave < wave_count; ++wave) {
        blts_kernel <<< waveGrid, waveBlock >>> (buffers->rsd, buffers->u, wave, k, buffers->columns, buffers->rows, buffers->wave2d_offsets, buffers->wave3d_offsets, buffers->thread_map);
    }

    CUT_CHECK_ERROR("Error in blts.\n");
    cudaThreadSynchronize();
}
cuda/kernels/l2norm.cuh0000644000175600017620000000100311440474601013525 0ustar sjpsjp
/**
 * CUDA kernel for calculating l2norm.
 * Block m sums the squares of component m of rsd (tiled layout) over
 * k in [1, nz_d - 2], j in [jst_d, jend_d], i in [ist_d, iend_d] and
 * stores the result in sum[m]. The whole sum is computed serially by
 * the thread that runs the block -- pretty awful.
 */
__global__ void l2norm_kernel(fp_type* rsd, fp_type* sum)
{
    const int m = blockIdx.x;
    fp_type acc = 0.0e+00;

    // Compute the sum for this m.
    for (int k = 1; k <= nz_d - 2; k++) {
        for (int j = jst_d; j <= jend_d; j++) {
            for (int i = ist_d; i <= iend_d; i++) {
                const fp_type value = rsd[tiled_index(k, j, i, m)];
                acc += value * value;
            }
        }
    }

    sum[m] = acc;
}
cuda/kernels/preprocessing.cuh0000644000175600017620000000147511440474564015224 0ustar sjpsjp
/**
 * CUDA kernel for pre-processing step of iteration loop.
 * Carries out SSOR iteration.
*/
__global__ void preprocessing_kernel(fp_type* rsd)
{
    // Decode (i, j, k): blockIdx.y carries both the j tile and the k level.
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Threads outside the interior have nothing to do.
    if (i < ist_d || i > iend_d || j < jst_d || j > jend_d || k < 1 || k > nz_d - 2) {
        return;
    }

    // Index of component 0 and the distance between consecutive components.
    const int base = tiled_index(k, j, i, 0);
    const int stride = (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3);

    // Scale all five components of rsd by dt.
    for (int m = 0; m < 5; m++) {
        rsd[base + m * stride] *= dt_d;
    }
}
cuda/kernels/postprocessing.cuh0000644000175600017620000000174211440474552015415 0ustar sjpsjp
/**
 * CUDA kernel for post-processing step of iteration loop.
 * Updates the variables: u += tmp_d * rsd for every interior point
 * and each of the five components (tiled layout).
 */
__global__ void postprocessing_kernel(fp_type* u, fp_type* rsd)
{
    // Decode (i, j, k): blockIdx.y carries both the j tile and the k level.
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Threads outside the interior have nothing to do.
    if (i < ist_d || i > iend_d || j < jst_d || j > jend_d || k < 1 || k > nz_d - 2) {
        return;
    }

    // Index of component 0 and the distance between consecutive components.
    const int base = tiled_index(k, j, i, 0);
    const int stride = (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3);

    for (int m = 0; m < 5; m++) {
        u[base + m * stride] += tmp_d * rsd[base + m * stride];
    }
}
cuda/kernels/data_rearrangement.cuh0000644000175600017620000001250411440474540016151 0ustar sjpsjp
/**
 * Shift from flat to hyperplane layout.
*/
__global__ void flat_to_hyperplane_kernel(fp_type* flat_input, fp_type* hyperplane_output, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // One thread per (i, j) column.
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = (blockIdx.y * rhsblock_y) + threadIdx.y;

    // Skip threads that fall outside the padded (isiz1 + 4) x (isiz2 + 4) plane.
    if (i >= (isiz1 + 4) || j >= (isiz2 + 4)) {
        return;
    }

    // Walk every k level and copy each of the five components.
    for (int k = 0; k < nz_d; k++) {
        for (int m = 0; m < 5; m++) {
            const int dst = hyperplane_index(k, j, i, m, wave_offset_2d, wave_offset_3d, thread_map_d);
            hyperplane_output[dst] = flat_input[flat_index(k, j, i, m)];
        }
    }
}

/**
 * Shift from hyperplane to flat layout.
 * Inverse of flat_to_hyperplane_kernel; one thread per (i, j) column.
 */
__global__ void hyperplane_to_flat_kernel(fp_type* hyperplane_input, fp_type* flat_output, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = (blockIdx.y * rhsblock_y) + threadIdx.y;

    // Skip threads that fall outside the padded plane.
    if (i >= (isiz1 + 4) || j >= (isiz2 + 4)) {
        return;
    }

    // Walk every k level and copy each of the five components.
    for (int k = 0; k < nz_d; k++) {
        for (int m = 0; m < 5; m++) {
            const int src = hyperplane_index(k, j, i, m, wave_offset_2d, wave_offset_3d, thread_map_d);
            flat_output[flat_index(k, j, i, m)] = hyperplane_input[src];
        }
    }
}

/**
 * Shift from flat to tiled layout.
 * One thread per (i, j) column; each thread copies all k levels and
 * all five components.
 */
__global__ void flat_to_tiled_kernel(fp_type* flat_input, fp_type* tiled_output)
{
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = (blockIdx.y * rhsblock_y) + threadIdx.y;

    // Skip threads that fall outside the padded plane.
    if (i >= (isiz1 + 4) || j >= (isiz2 + 4)) {
        return;
    }

    for (int k = 0; k < nz_d; k++) {
        for (int m = 0; m < 5; m++) {
            tiled_output[tiled_index(k, j, i, m)] = flat_input[flat_index(k, j, i, m)];
        }
    }
}

/**
 * Shift from tiled to flat layout.
 * Inverse of flat_to_tiled_kernel; one thread per (i, j) column.
 */
__global__ void tiled_to_flat_kernel(fp_type* tiled_input, fp_type* flat_output)
{
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = (blockIdx.y * rhsblock_y) + threadIdx.y;

    // Skip threads that fall outside the padded plane.
    if (i >= (isiz1 + 4) || j >= (isiz2 + 4)) {
        return;
    }

    for (int k = 0; k < nz_d; k++) {
        for (int m = 0; m < 5; m++) {
            flat_output[flat_index(k, j, i, m)] = tiled_input[tiled_index(k, j, i, m)];
        }
    }
}

/**
 * Shift from tiled to hyperplane layout.
 * One thread per (i, j, k) point: blockIdx.y encodes both the j tile
 * (blockIdx.y % rhsgrid_y) and the k level (blockIdx.y / rhsgrid_y).
 */
__global__ void tiled_to_hyperplane_kernel(fp_type* tiled_input, fp_type* hyperplane_output, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Skip threads that fall outside the padded plane.
    if (i >= (isiz1 + 4) || j >= (isiz2 + 4)) {
        return;
    }

    // Component-0 position and per-component stride in the hyperplane layout.
    const int h_base = hyperplane_index(k, j, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int h_stride = problem_height * (isiz2 + 4) * (isiz1 + 4);

    // Component-0 position and per-component stride in the tiled layout.
    const int t_base = tiled_index(k, j, i, 0);
    const int t_stride = (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3);

    for (int m = 0; m < 5; m++) {
        hyperplane_output[h_base + m * h_stride] = tiled_input[t_base + m * t_stride];
    }
}

/**
 * Shift from hyperplane to tiled layout.
*/
__global__ void hyperplane_to_tiled_kernel(fp_type* hyperplane_input, fp_type* tiled_output, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // One thread per (i, j, k) point; blockIdx.y encodes the j tile and the k level.
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Skip threads that fall outside the padded (isiz1 + 4) x (isiz2 + 4) plane.
    if (i >= (isiz1 + 4) || j >= (isiz2 + 4)) {
        return;
    }

    // Component-0 position and per-component stride in the hyperplane layout.
    const int h_base = hyperplane_index(k, j, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int h_stride = problem_height * (isiz2 + 4) * (isiz1 + 4);

    // Component-0 position and per-component stride in the tiled layout.
    const int t_base = tiled_index(k, j, i, 0);
    const int t_stride = (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3);

    for (int m = 0; m < 5; m++) {
        tiled_output[t_base + m * t_stride] = hyperplane_input[h_base + m * h_stride];
    }
}
cuda/kernels/debug.cuh0000644000175600017620000000026111342773342013414 0ustar sjpsjp
// Define a variadic macro for debug statement.
// It forwards to printf when EMU is defined and expands to nothing otherwise.
#ifdef EMU
#define debug(format, ...) printf(format, ## __VA_ARGS__)
#else
#define debug(format, ...) // An empty line.
#endif
cuda/kernels/exchange/exchange_3_send.cuh0000644000175600017620000000702511442157562017133 0ustar sjpsjp
// Some macros to increase readability.
#define ursdblock (problem_height * (isiz2 + 4) * (isiz1 + 4))

// Packs g into buf.
__global__ void ex3_pack_south(fp_type* buf, fp_type* g)
{
    // Check if thread is within range.
const int j = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (j > ny_d + 1 || k > nz_d - 1) {
        return;
    }

    // Plane i = nx_d fills the first ny_d * nz_d cells of buf, plane i = nx_d + 1 the next ones.
    const int first = k*ny_d + j - 2;
    const int second = first + ny_d*nz_d;
    for (int m = 0; m < 5; m++) {
        buf[(first * 5) + m] = g[tiled_index(k, j, nx_d, m)];
        buf[(second * 5) + m] = g[tiled_index(k, j, nx_d + 1, m)];
    }
}

// Packs g into buf: planes i = 3 and i = 2 of the tiled array g, for
// j in [2, ny_d + 1] and k in [0, nz_d - 1], five values per cell.
__global__ void ex3_pack_north(fp_type* buf, fp_type* g)
{
    // Check if thread is within range.
    const int j = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (j > ny_d + 1 || k > nz_d - 1) {
        return;
    }

    // Plane i = 3 fills the first ny_d * nz_d cells of buf, plane i = 2 the next ones.
    const int first = k*ny_d + j - 2;
    const int second = first + ny_d*nz_d;
    for (int m = 0; m < 5; m++) {
        buf[(first * 5) + m] = g[tiled_index(k, j, 3, m)];
        buf[(second * 5) + m] = g[tiled_index(k, j, 2, m)];
    }
}

// Packs g into buf.
__global__ void ex3_pack_east(fp_type* buf, fp_type* g)
{
    // Check if thread is within range.
const int i = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (i > nx_d + 1 || k > nz_d - 1) {
        return;
    }

    // Plane j = ny_d fills the first nx_d * nz_d cells of buf, plane j = ny_d + 1 the next ones.
    const int first = k*nx_d + i - 2;
    const int second = first + nx_d*nz_d;
    for (int m = 0; m < 5; m++) {
        buf[(first * 5) + m] = g[tiled_index(k, ny_d, i, m)];
        buf[(second * 5) + m] = g[tiled_index(k, ny_d + 1, i, m)];
    }
}

// Packs g into buf: planes j = 3 and j = 2 of the tiled array g, for
// i in [2, nx_d + 1] and k in [0, nz_d - 1], five values per cell.
__global__ void ex3_pack_west(fp_type* buf, fp_type* g)
{
    // Check if thread is within range.
    const int i = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (i > nx_d + 1 || k > nz_d - 1) {
        return;
    }

    // Plane j = 3 fills the first nx_d * nz_d cells of buf, plane j = 2 the next ones.
    const int first = k*nx_d + i - 2;
    const int second = first + nx_d*nz_d;
    for (int m = 0; m < 5; m++) {
        buf[(first * 5) + m] = g[tiled_index(k, 3, i, m)];
        buf[(second * 5) + m] = g[tiled_index(k, 2, i, m)];
    }
}
cuda/kernels/exchange/exchange_3_recv.cuh0000644000175600017620000000711511442157612017135 0ustar sjpsjp
// Some macros to increase readability.
#define ursdblock (problem_height * (isiz2 + 4) * (isiz1 + 4))

// Unpacks buf1 into g.
__global__ void ex3_unpack_north(fp_type* buf1, fp_type* g)
{
    // Check if thread is within range.
const int j = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (j > ny_d + 1 || k > nz_d - 1) {
        return;
    }

    // The first ny_d * nz_d cells of buf1 go to plane i = 0, the next ones to plane i = 1.
    const int first = k*ny_d + j - 2;
    const int second = first + ny_d*nz_d;
    for (int m = 0; m < 5; m++) {
        g[tiled_index(k, j, 0, m)] = buf1[(first * 5) + m];
        g[tiled_index(k, j, 1, m)] = buf1[(second * 5) + m];
    }
}

// Unpacks buf1 into g: writes planes i = nx_d + 3 and i = nx_d + 2 of the
// tiled array g, for j in [2, ny_d + 1] and k in [0, nz_d - 1].
__global__ void ex3_unpack_south(fp_type* buf1, fp_type* g)
{
    // Check if thread is within range.
    const int j = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (j > ny_d + 1 || k > nz_d - 1) {
        return;
    }

    // The first ny_d * nz_d cells of buf1 go to plane i = nx_d + 3, the next ones to i = nx_d + 2.
    const int first = k*ny_d + j - 2;
    const int second = first + ny_d*nz_d;
    for (int m = 0; m < 5; m++) {
        g[tiled_index(k, j, nx_d + 3, m)] = buf1[(first * 5) + m];
        g[tiled_index(k, j, nx_d + 2, m)] = buf1[(second * 5) + m];
    }
}

// Unpacks buf1 into g.
__global__ void ex3_unpack_west(fp_type* buf1, fp_type* g)
{
    // Check if thread is within range.
const int i = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (i > nx_d + 1 || k > nz_d - 1) {
        return;
    }

    // The first nx_d * nz_d cells of buf1 go to plane j = 0, the next ones to plane j = 1.
    const int first = k*nx_d + i - 2;
    const int second = first + nx_d*nz_d;
    for (int m = 0; m < 5; m++) {
        g[tiled_index(k, 0, i, m)] = buf1[(first * 5) + m];
        g[tiled_index(k, 1, i, m)] = buf1[(second * 5) + m];
    }
}

// Unpacks buf1 into g: writes planes j = ny_d + 3 and j = ny_d + 2 of the
// tiled array g, for i in [2, nx_d + 1] and k in [0, nz_d - 1].
__global__ void ex3_unpack_east(fp_type* buf1, fp_type* g)
{
    // Check if thread is within range.
    const int i = 2 + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int k = blockIdx.y;
    if (i > nx_d + 1 || k > nz_d - 1) {
        return;
    }

    // The first nx_d * nz_d cells of buf1 go to plane j = ny_d + 3, the next ones to j = ny_d + 2.
    const int first = k*nx_d + i - 2;
    const int second = first + nx_d*nz_d;
    for (int m = 0; m < 5; m++) {
        g[tiled_index(k, ny_d + 3, i, m)] = buf1[(first * 5) + m];
        g[tiled_index(k, ny_d + 2, i, m)] = buf1[(second * 5) + m];
    }
}
cuda/kernels/exchange/exchange_1_send.cuh0000644000175600017620000000634611442155204017125 0ustar sjpsjp
// Some macros to increase readability.
#define ursdblock (problem_height * (isiz2 + 4) * (isiz1 + 4))

// Packs g into jsend.
__global__ void ex1_pack_south(fp_type* jsend, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // Check if thread is within range.
const int j = jst_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int z = blockIdx.y;
    // Level handled by this block row: the k-block is addressed from its top level downwards.
    const int level = k + z - (kblock - 1);
    if (j > jend_d || level <= 0 || level >= isiz3 - 1) {
        return;
    }

    // Source: column i = nx_d + 1 of the hyperplane array; destination: row z of jsend.
    const int h_index = hyperplane_index(level, j, nx_d + 1, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int b_index = (z * (jend_d-jst_d+1) * 5) + ((j - jst_d) * 5);
    for (int m = 0; m < 5; m++) {
        jsend[b_index + m] = g[h_index + m * ursdblock];
    }
}

// Packs g into isend: row j = ny_d + 1 of the hyperplane array g, for
// i in [ist_d, iend_d] and the levels k + z - (kblock - 1) that lie
// strictly between 0 and isiz3 - 1.
__global__ void ex1_pack_east(fp_type* isend, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // Check if thread is within range.
    const int i = ist_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int z = blockIdx.y;
    const int level = k + z - (kblock - 1);
    if (i > iend_d || level <= 0 || level >= isiz3 - 1) {
        return;
    }

    const int h_index = hyperplane_index(level, ny_d + 1, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int b_index = (z * (iend_d-ist_d+1) * 5) + ((i - ist_d) * 5);
    for (int m = 0; m < 5; m++) {
        isend[b_index + m] = g[h_index + m * ursdblock];
    }
}

// Packs g into jsend.
__global__ void ex1_pack_north(fp_type* jsend, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // Check if thread is within range.
const int row   = jst_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int slab  = blockIdx.y;
    const int plane = k + slab;

    // Only rows up to jend_d on planes strictly between 0 and isiz3 - 1 take part.
    if (row > jend_d || plane <= 0 || plane >= isiz3 - 1) return;

    const int src = hyperplane_index(plane, row, 2, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int dst = (slab * (jend_d-jst_d+1) * 5) + ((row - jst_d) * 5);

    // The five components of a cell sit one ursdblock apart in g and are
    // stored back to back in the send buffer.
    for (int m = 0; m < 5; m++) {
        jsend[dst + m] = g[src + m * ursdblock];
    }
}

// Packs g into isend.
// Reads the cells at j = 2 for i in [ist_d, iend_d]; blockIdx.y selects the
// k-plane k + blockIdx.y.
__global__ void ex1_pack_west(fp_type* isend, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    const int col   = ist_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int slab  = blockIdx.y;
    const int plane = k + slab;

    // Only columns up to iend_d on planes strictly between 0 and isiz3 - 1 take part.
    if (col > iend_d || plane <= 0 || plane >= isiz3 - 1) return;

    const int src = hyperplane_index(plane, 2, col, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int dst = (slab * (iend_d-ist_d+1) * 5) + ((col - ist_d) * 5);

    for (int m = 0; m < 5; m++) {
        isend[dst + m] = g[src + m * ursdblock];
    }
}
cuda/kernels/exchange/exchange_1_recv.cuh0000644000175600017620000000634411542410563017134 0ustar sjpsjp// Some macros to increase readability.
#define ursdblock (problem_height * (isiz2 + 4) * (isiz1 + 4))

// Unpacks jrecv into g.
__global__ void ex1_unpack_north(fp_type* jrecv, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // Check if thread is within range.
const int row   = jst_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int slab  = blockIdx.y;
    const int plane = k + slab;

    // Only rows up to jend_d on planes strictly between 0 and isiz3 - 1 take part.
    if (row > jend_d || plane <= 0 || plane >= isiz3 - 1) return;

    const int dst = hyperplane_index(plane, row, 1, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int src = (slab * (jend_d-jst_d+1) * 5) + ((row - jst_d) * 5);

    // The receive buffer holds the five components back to back; in g they
    // sit one ursdblock apart.
    for (int m = 0; m < 5; m++) {
        g[dst + m * ursdblock] = jrecv[src + m];
    }
}

// Unpacks irecv into g.
// Writes the cells at j = 1 for i in [ist_d, iend_d]; blockIdx.y selects the
// k-plane k + blockIdx.y.
__global__ void ex1_unpack_west(fp_type* irecv, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    const int col   = ist_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int slab  = blockIdx.y;
    const int plane = k + slab;

    // Only columns up to iend_d on planes strictly between 0 and isiz3 - 1 take part.
    if (col > iend_d || plane <= 0 || plane >= isiz3 - 1) return;

    const int dst = hyperplane_index(plane, 1, col, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int src = (slab * (iend_d-ist_d+1) * 5) + ((col - ist_d) * 5);

    for (int m = 0; m < 5; m++) {
        g[dst + m * ursdblock] = irecv[src + m];
    }
}

// Unpacks jrecv into g.
__global__ void ex1_unpack_south(fp_type* jrecv, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    // Check if thread is within range.
const int row   = jst_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int slab  = blockIdx.y;
    const int plane = k + slab - (kblock - 1);

    // Only rows up to jend_d on planes strictly between 0 and isiz3 - 1 take part.
    if (row > jend_d || plane <= 0 || plane >= isiz3 - 1) return;

    const int dst = hyperplane_index(plane, row, nx_d + 2, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int src = (slab * (jend_d-jst_d+1) * 5) + ((row - jst_d) * 5);

    // The receive buffer holds the five components back to back; in g they
    // sit one ursdblock apart.
    for (int m = 0; m < 5; m++) {
        g[dst + m * ursdblock] = jrecv[src + m];
    }
}

// Unpacks irecv into g.
// Writes the cells at j = ny_d + 2 for i in [ist_d, iend_d]; blockIdx.y selects
// the k-plane k + blockIdx.y - (kblock - 1).
__global__ void ex1_unpack_east(fp_type* irecv, fp_type* g, int k, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    const int col   = ist_d + (blockIdx.x * blockDim.x) + threadIdx.x;
    const int slab  = blockIdx.y;
    const int plane = k + slab - (kblock - 1);

    // Only columns up to iend_d on planes strictly between 0 and isiz3 - 1 take part.
    if (col > iend_d || plane <= 0 || plane >= isiz3 - 1) return;

    const int dst = hyperplane_index(plane, ny_d + 2, col, 0, wave_offset_2d, wave_offset_3d, thread_map_d);
    const int src = (slab * (iend_d-ist_d+1) * 5) + ((col - ist_d) * 5);

    for (int m = 0; m < 5; m++) {
        g[dst + m * ursdblock] = irecv[src + m];
    }
}
cuda/kernels/jacld/jacld_d.cuh0000644000175600017620000001045511440543137014765 0ustar sjpsjp/**
 * Device function for calculating, just-in-time, the value of d[k][j][i][l][m].
 * The entry is selected at compile time by the template arguments l and m.
 */
template <int l, int m>
__device__ fp_type jacld_d_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4)
{
    // Some constants.
const fp_type c1 = c1_def;
    const fp_type c3 = c3_def;
    const fp_type c4 = c4_def;
    const fp_type c5 = c5_def;

    const fp_type fpzero = 0.0e+00;
    const fp_type fpone = 1.0e+00;
    const fp_type fptwo = 2.0e+00;
    const fp_type fpthree = 3.0e+00;
    const fp_type fpfour = 4.0e+00;

    const fp_type r43 = ( fpfour / fpthree );
    const fp_type c1345 = c1 * c3 * c4 * c5;
    const fp_type c34 = c3 * c4;

    // Powers of 1 / u0.
    const fp_type tmp1 = fpone / u0;
    const fp_type tmp2 = tmp1 * tmp1;
    const fp_type tmp3 = tmp1 * tmp2;

    // Every entry that is not assigned below is zero. Starting from zero also
    // means no indeterminate value is returned when (l, m) matches no branch.
    fp_type result = fpzero;

    if (m == 0) {
        if (l == 0) {
            result = fpone + dt_d * fptwo * ( tx1_d * dx1_d + ty1_d * dy1_d + tz1_d * dz1_d );
        }
    } else if (m == 1) {
        if (l == 0) {
            result = dt_d * fptwo * ( tx1_d * ( - r43 * c34 * tmp2 * u1 )
                                    + ty1_d * ( - c34 * tmp2 * u1 )
                                    + tz1_d * ( - c34 * tmp2 * u1 ) );
        } else if (l == 1) {
            result = fpone + dt_d * fptwo * ( tx1_d * r43 * c34 * tmp1 + ty1_d * c34 * tmp1 + tz1_d * c34 * tmp1 )
                           + dt_d * fptwo * ( tx1_d * dx2_d + ty1_d * dy2_d + tz1_d * dz2_d );
        }
    } else if (m == 2) {
        if (l == 0) {
            result = dt_d * fptwo * ( tx1_d * ( - c34 * tmp2 * u2 )
                                    + ty1_d * ( - r43 * c34 * tmp2 * u2 )
                                    + tz1_d * ( - c34 * tmp2 * u2 ) );
        } else if (l == 2) {
            result = fpone + dt_d * fptwo * ( tx1_d * c34 * tmp1 + ty1_d * r43 * c34 * tmp1 + tz1_d * c34 * tmp1 )
                           + dt_d * fptwo * ( tx1_d * dx3_d + ty1_d * dy3_d + tz1_d * dz3_d );
        }
    } else if (m == 3) {
        if (l == 0) {
            result = dt_d * fptwo * ( tx1_d * ( - c34 * tmp2 * u3 )
                                    + ty1_d * ( - c34 * tmp2 * u3 )
                                    + tz1_d * ( - r43 * c34 * tmp2 * u3 ) );
        } else if (l == 3) {
            result = fpone + dt_d * fptwo * ( tx1_d * c34 * tmp1 + ty1_d * c34 * tmp1 + tz1_d * r43 * c34 * tmp1 )
                           + dt_d * fptwo * ( tx1_d * dx4_d + ty1_d * dy4_d + tz1_d * dz4_d );
        }
    } else if (m == 4) {
        if (l == 0) {
            result = dt_d * fptwo * ( tx1_d * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 )
                                                - ( c34 - c1345 ) * tmp3 * ( u2 * u2 )
                                                - ( c34 - c1345 ) * tmp3 * ( u3 * u3 )
                                                - ( c1345 ) * tmp2 * ( u4 ) )
                                    + ty1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 )
                                                - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 )
                                                - ( c34 - c1345 ) * tmp3 * ( u3 * u3 )
                                                - ( c1345 ) * tmp2 * ( u4 ) )
                                    + tz1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 )
                                                - ( c34 - c1345 ) * tmp3 * ( u2 * u2 )
                                                - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 )
                                                - ( c1345 ) * tmp2 * u4 ) );
        } else if (l == 1) {
            result = dt_d * fptwo * ( tx1_d * ( r43 * c34 - c1345 ) * tmp2 * u1
                                    + ty1_d * ( c34 - c1345 ) * tmp2 * u1
                                    + tz1_d * ( c34 - c1345 ) * tmp2 * u1 );
        } else if (l == 2) {
            result = dt_d * fptwo * ( tx1_d * ( c34 - c1345 ) * tmp2 * u2
                                    + ty1_d * ( r43 * c34 - c1345 ) * tmp2 * u2
                                    + tz1_d * ( c34 - c1345 ) * tmp2 * u2 );
        } else if (l == 3) {
            result = dt_d * fptwo * ( tx1_d * ( c34 - c1345 ) * tmp2 * u3
                                    + ty1_d * ( c34 - c1345 ) * tmp2 * u3
                                    + tz1_d * ( r43 * c34 - c1345 ) * tmp2 * u3 );
        } else if (l == 4) {
            result = fpone + dt_d * fptwo * ( tx1_d * c1345 * tmp1 + ty1_d * c1345 * tmp1 + tz1_d * c1345 * tmp1 )
                           + dt_d * fptwo * ( tx1_d * dx5_d + ty1_d * dy5_d + tz1_d * dz5_d );
        }
    }

    return result;
}
cuda/kernels/jacld/jacld_c.cuh0000644000175600017620000000651011440543034014755 0ustar sjpsjp/**
 * Device function for calculating, just-in-time, the value of c[k][j][i][l][m].
 * The entry is selected at compile time by the template arguments l and m.
 */
template <int l, int m>
__device__ fp_type jacld_c_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4)
{
    // Some constants.
const fp_type c1 = c1_def;
    const fp_type c2 = c2_def;
    const fp_type c3 = c3_def;
    const fp_type c4 = c4_def;
    const fp_type c5 = c5_def;

    const fp_type fpzero = 0.0e+00;
    const fp_type fphalf = 0.50e+00;
    const fp_type fpone = 1.0e+00;
    const fp_type fptwo = 2.0e+00;
    const fp_type fpthree = 3.0e+00;
    const fp_type fpfour = 4.0e+00;

    const fp_type r43 = ( fpfour / fpthree );
    const fp_type c1345 = c1 * c3 * c4 * c5;
    const fp_type c34 = c3 * c4;

    // Powers of 1 / u0.
    const fp_type tmp1 = fpone / u0;
    const fp_type tmp2 = tmp1 * tmp1;
    const fp_type tmp3 = tmp1 * tmp2;

    // Every entry that is not assigned below is zero. Starting from zero also
    // means no indeterminate value is returned when (l, m) matches no branch.
    fp_type result = fpzero;

    if (m == 0) {
        if (l == 0) {
            result = -dt_d * tx1_d * dx1_d;
        } else if (l == 1) {
            result = -dt_d * tx2_d;
        }
    } else if (m == 1) {
        if (l == 0) {
            result = -dt_d * tx2_d * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * fphalf * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 )
                     -dt_d * tx1_d * ( -r43 * c34 * tmp2 * u1 );
        } else if (l == 1) {
            result = -dt_d * tx2_d * ( ( fptwo - c2 ) * ( u1 * tmp1 ) )
                     -dt_d * tx1_d * ( r43 * c34 * tmp1 )
                     -dt_d * tx1_d * dx2_d;
        } else if (l == 2) {
            result = -dt_d * tx2_d * ( -c2 * ( u2 * tmp1 ) );
        } else if (l == 3) {
            result = -dt_d * tx2_d * ( -c2 * ( u3 * tmp1 ) );
        } else if (l == 4) {
            result = -dt_d * tx2_d * c2;
        }
    } else if (m == 2) {
        if (l == 0) {
            result = -dt_d * tx2_d * ( - ( u1 * u2 ) * tmp2 )
                     -dt_d * tx1_d * ( - c34 * tmp2 * u2 );
        } else if (l == 1) {
            result = -dt_d * tx2_d * ( u2 * tmp1 );
        } else if (l == 2) {
            result = -dt_d * tx2_d * ( u1 * tmp1 )
                     -dt_d * tx1_d * ( c34 * tmp1 )
                     -dt_d * tx1_d * dx3_d;
        }
    } else if (m == 3) {
        if (l == 0) {
            result = -dt_d * tx2_d * ( - ( u1 * u3 ) * tmp2 )
                     -dt_d * tx1_d * ( - c34 * tmp2 * u3 );
        } else if (l == 1) {
            result = -dt_d * tx2_d * ( u3 * tmp1 );
        } else if (l == 3) {
            result = -dt_d * tx2_d * ( u1 * tmp1 )
                     -dt_d * tx1_d * ( c34 * tmp1 )
                     -dt_d * tx1_d * dx4_d;
        }
    } else if (m == 4) {
        if (l == 0) {
            result = -dt_d * tx2_d * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) )
                     -dt_d * tx1_d * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 )
                                       - ( c34 - c1345 ) * tmp3 * ( u2 * u2 )
                                       - ( c34 - c1345 ) * tmp3 * ( u3 * u3 )
                                       - c1345 * tmp2 * u4 );
        } else if (l == 1) {
            result = -dt_d * tx2_d * ( c1 * ( u4 * tmp1 ) - fphalf * c2 * ( ( fpthree * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) )
                     -dt_d * tx1_d * ( r43 * c34 - c1345 ) * tmp2 * u1;
        } else if (l == 2) {
            result = -dt_d * tx2_d * ( - c2 * ( u2 * u1 ) * tmp2 )
                     -dt_d * tx1_d * ( c34 - c1345 ) * tmp2 * u2;
        } else if (l == 3) {
            result = -dt_d * tx2_d * ( - c2 * ( u3 * u1 ) * tmp2 )
                     - dt_d * tx1_d * ( c34 - c1345 ) * tmp2 * u3;
        } else if (l == 4) {
            result = -dt_d * tx2_d * ( c1 * ( u1 * tmp1 ) )
                     - dt_d * tx1_d * c1345 * tmp1
                     - dt_d * tx1_d * dx5_d;
        }
    }

    return result;
}
cuda/kernels/jacld/jacld_b.cuh0000644000175600017620000000653611440542506014767 0ustar sjpsjp/**
 * Device function for calculating, just-in-time, the value of b[k][j][i][l][m].
 * The entry is selected at compile time by the template arguments l and m.
 */
template <int l, int m>
__device__ fp_type jacld_b_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4)
{
    // Some constants.
const fp_type c1 = c1_def;
    const fp_type c2 = c2_def;
    const fp_type c3 = c3_def;
    const fp_type c4 = c4_def;
    const fp_type c5 = c5_def;

    const fp_type fpzero = 0.0e+00;
    const fp_type fphalf = 0.50e+00;
    const fp_type fpone = 1.0e+00;
    const fp_type fptwo = 2.0e+00;
    const fp_type fpthree = 3.0e+00;
    const fp_type fpfour = 4.0e+00;

    const fp_type r43 = ( fpfour / fpthree );
    const fp_type c1345 = c1 * c3 * c4 * c5;
    const fp_type c34 = c3 * c4;

    // Powers of 1 / u0.
    const fp_type tmp1 = fpone / u0;
    const fp_type tmp2 = tmp1 * tmp1;
    const fp_type tmp3 = tmp1 * tmp2;

    // Every entry that is not assigned below is zero. Starting from zero also
    // means no indeterminate value is returned when (l, m) matches no branch.
    fp_type result = fpzero;

    if (m == 0) {
        if (l == 0) {
            result = -dt_d * ty1_d * dy1_d;
        } else if (l == 2) {
            result = -dt_d * ty2_d;
        }
    } else if (m == 1) {
        if (l == 0) {
            result = -dt_d * ty2_d * ( - ( u1 * u2 ) * tmp2 )
                     -dt_d * ty1_d * ( - c34 * tmp2 * u1 );
        } else if (l == 1) {
            result = -dt_d * ty2_d * ( u2 * tmp1 )
                     -dt_d * ty1_d * ( c34 * tmp1 )
                     -dt_d * ty1_d * dy2_d;
        } else if (l == 2) {
            result = -dt_d * ty2_d * ( u1 * tmp1 );
        }
    } else if (m == 2) {
        if (l == 0) {
            result = -dt_d * ty2_d * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + fphalf * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) )
                     -dt_d * ty1_d * ( -r43 * c34 * tmp2 * u2 );
        } else if (l == 1) {
            result = -dt_d * ty2_d * ( - c2 * ( u1 * tmp1 ) );
        } else if (l == 2) {
            result = -dt_d * ty2_d * ( ( fptwo - c2 ) * ( u2 * tmp1 ) )
                     -dt_d * ty1_d * ( r43 * c34 * tmp1 )
                     -dt_d * ty1_d * dy3_d;
        } else if (l == 3) {
            result = -dt_d * ty2_d * ( - c2 * ( u3 * tmp1 ) );
        } else if (l == 4) {
            result = - dt_d * ty2_d * c2;
        }
    } else if (m == 3) {
        if (l == 0) {
            result = -dt_d * ty2_d * ( - ( u2 * u3 ) * tmp2 )
                     - dt_d * ty1_d * ( -c34 * tmp2 * u3 );
        } else if (l == 2) {
            result = -dt_d * ty2_d * ( u3 * tmp1 );
        } else if (l == 3) {
            result = -dt_d * ty2_d * ( u2 * tmp1 )
                     -dt_d * ty1_d * ( c34 * tmp1 )
                     -dt_d * ty1_d * dy4_d;
        }
    } else if (m == 4) {
        if (l == 0) {
            result = -dt_d * ty2_d * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) )
                     -dt_d * ty1_d * ( - ( c34 - c1345 ) * tmp3 * u1 * u1
                                       - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2
                                       - ( c34 - c1345 ) * tmp3 * u3 * u3
                                       - c1345 * tmp2 * u4 );
        } else if (l == 1) {
            result = -dt_d * ty2_d * ( - c2 * ( u1 * u2 ) * tmp2 )
                     -dt_d * ty1_d * ( c34 - c1345 ) * tmp2 * u1;
        } else if (l == 2) {
            result = -dt_d * ty2_d * ( c1 * ( u4 * tmp1 ) - fphalf * c2 * ( ( u1 * u1 + fpthree * u2 * u2 + u3 * u3 ) * tmp2 ) )
                     -dt_d * ty1_d * ( r43 * c34 - c1345 ) * tmp2 * u2;
        } else if (l == 3) {
            result = -dt_d * ty2_d * ( - c2 * ( u2 * u3 ) * tmp2 )
                     - dt_d * ty1_d * ( c34 - c1345 ) * tmp2 * u3;
        } else if (l == 4) {
            result = -dt_d * ty2_d * ( c1 * ( u2 * tmp1 ) )
                     -dt_d * ty1_d * c1345 * tmp1
                     -dt_d * ty1_d * dy5_d;
        }
    }

    return result;
}
cuda/kernels/jacld/jacld_a.cuh0000644000175600017620000000645611440542377014775 0ustar sjpsjp/**
 * Device function for calculating, just-in-time, the value of a[k][j][i][l][m].
 * The entry is selected at compile time by the template arguments l and m.
 */
template <int l, int m>
__device__ fp_type jacld_a_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4)
{
    // Some constants.
const fp_type c1 = c1_def;
    const fp_type c2 = c2_def;
    const fp_type c3 = c3_def;
    const fp_type c4 = c4_def;
    const fp_type c5 = c5_def;

    const fp_type fpzero = 0.0e+00;
    const fp_type fphalf = 0.50e+00;
    const fp_type fpone = 1.0e+00;
    const fp_type fptwo = 2.0e+00;
    const fp_type fpthree = 3.0e+00;
    const fp_type fpfour = 4.0e+00;

    const fp_type r43 = ( fpfour / fpthree );
    const fp_type c1345 = c1 * c3 * c4 * c5;
    const fp_type c34 = c3 * c4;

    // Powers of 1 / u0.
    const fp_type tmp1 = fpone / u0;
    const fp_type tmp2 = tmp1 * tmp1;
    const fp_type tmp3 = tmp1 * tmp2;

    // Every entry that is not assigned below is zero. Starting from zero also
    // means no indeterminate value is returned when (l, m) matches no branch.
    fp_type result = fpzero;

    if (m == 0) {
        if (l == 0) {
            result = -dt_d * tz1_d * dz1_d;
        } else if (l == 3) {
            result = -dt_d * tz2_d;
        }
    } else if (m == 1) {
        if (l == 0) {
            result = -dt_d * tz2_d * ( - ( u1 * u3 ) * tmp2 )
                     -dt_d * tz1_d * ( - c34 * tmp2 * u1 );
        } else if (l == 1) {
            result = -dt_d * tz2_d * ( u3 * tmp1 )
                     -dt_d * tz1_d * c34 * tmp1
                     -dt_d * tz1_d * dz2_d;
        } else if (l == 3) {
            result = -dt_d * tz2_d * ( u1 * tmp1 );
        }
    } else if (m == 2) {
        if (l == 0) {
            result = -dt_d * tz2_d * ( - ( u2 * u3 ) * tmp2 )
                     - dt_d * tz1_d * ( -c34 * tmp2 * u2 );
        } else if (l == 2) {
            result = -dt_d * tz2_d * ( u3 * tmp1 )
                     -dt_d * tz1_d * ( c34 * tmp1 )
                     -dt_d * tz1_d * dz3_d;
        } else if (l == 3) {
            result = -dt_d * tz2_d * ( u2 * tmp1 );
        }
    } else if (m == 3) {
        if (l == 0) {
            result = -dt_d * tz2_d * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + fphalf * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) )
                     -dt_d * tz1_d * ( -r43 * c34 * tmp2 * u3 );
        } else if (l == 1) {
            result = -dt_d * tz2_d * ( - c2 * ( u1 * tmp1 ) );
        } else if (l == 2) {
            result = -dt_d * tz2_d * ( - c2 * ( u2 * tmp1 ) );
        } else if (l == 3) {
            result = -dt_d * tz2_d * ( fptwo - c2 ) * ( u3 * tmp1 )
                     -dt_d * tz1_d * ( r43 * c34 * tmp1 )
                     -dt_d * tz1_d * dz4_d;
        } else if (l == 4) {
            result = -dt_d * tz2_d * c2;
        }
    } else if (m == 4) {
        if (l == 0) {
            result = -dt_d * tz2_d * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) )
                     -dt_d * tz1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 )
                                       - ( c34 - c1345 ) * tmp3 * ( u2 * u2 )
                                       - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 )
                                       - c1345 * tmp2 * u4 );
        } else if (l == 1) {
            result = -dt_d * tz2_d * ( - c2 * ( u1 * u3 ) * tmp2 )
                     - dt_d * tz1_d * ( c34 - c1345 ) * tmp2 * u1;
        } else if (l == 2) {
            result = -dt_d * tz2_d * ( - c2 * ( u2 * u3 ) * tmp2 )
                     - dt_d * tz1_d * ( c34 - c1345 ) * tmp2 * u2;
        } else if (l == 3) {
            result = -dt_d * tz2_d * ( c1 * ( u4 * tmp1 ) - fphalf * c2 * ( ( u1 * u1 + u2 * u2 + fpthree * u3 * u3 ) * tmp2 ) )
                     -dt_d * tz1_d * ( r43 * c34 - c1345 ) * tmp2 * u3;
        } else if (l == 4) {
            result = -dt_d * tz2_d * ( c1 * ( u3 * tmp1 ) )
                     -dt_d * tz1_d * c1345 * tmp1
                     -dt_d * tz1_d * dz5_d;
        }
    }

    return result;
}
cuda/kernels/buts/buts.cuh0000644000175600017620000003424711440673613014272 0ustar sjpsjp// Some macros to increase readability.
#define ursdblock (problem_height * (isiz2 + 4) * (isiz1 + 4))

/**
 * CUDA kernel to compute the regular-sparse, block upper triangular solution.
 *      v <-- ( U-inv ) * v
 * // d = d, udx = a, udy = b, udz = c
 */
__global__ void buts_kernel(fp_type* v, fp_type* u, const int wave, const int starting_k, int* columns_d, int* rows_d, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    /**
     * Local variables.
     */
    int i, j, k;
    fp_type tmp, tmp1;

    // Constants.
    const fp_type fpone = 1.0e+00;

    // Use five temporary variables for aggregation.
    fp_type v0, v1, v2, v3, v4;
    fp_type vn0, vn1, vn2, vn3, vn4;
    fp_type un0, un1, un2, un3, un4;

    // Replace tmat55 with 25 fp_types to be stored in registers.
    fp_type tmat00, tmat01, tmat02, tmat03, tmat04,
            tmat10, tmat11, tmat12, tmat13, tmat14,
            tmat20, tmat21, tmat22, tmat23, tmat24,
            tmat30, tmat31, tmat32, tmat33, tmat34,
            tmat40, tmat41, tmat42, tmat43, tmat44;

    // Calculate actual thread id.
    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    int threads = (gridDim.x * blockDim.x);

    // Each thread actually processes (cells / threads) cells in a coalesced manner.
for (int cell = tid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) {

        i = columns_d[cell];
        j = rows_d[cell];
        k = (starting_k - (kblock - 1)) + (wave - (i + j));

        // Check if this thread is active in this wave: the cell must lie in the
        // [ist_d, iend_d] x [jst_d, jend_d] x [1, nz_d - 2] range and its depth
        // inside the current k-block must be in [0, kblock - 1].
        // (An earlier mem_offset based activity test was disabled in the
        // original code and is not reproduced here.)
        const int level = (wave - (i + j));
        if (i < ist_d || i > iend_d || j < jst_d || j > jend_d
            || k < 1 || k > nz_d - 2 || level < 0 || level > kblock - 1) {
            continue;
        }

        const int centre = hyperplane_index(k, j, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);

        // ---- k direction: neighbour (k+1, j, i), c block ----
        const int kp1 = hyperplane_index(k+1, j, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);

        // Each neighbour component is read from global memory exactly once.
        vn0 = v[kp1 + 0 * ursdblock];
        vn1 = v[kp1 + 1 * ursdblock];
        vn2 = v[kp1 + 2 * ursdblock];
        vn3 = v[kp1 + 3 * ursdblock];
        vn4 = v[kp1 + 4 * ursdblock];

        // u at the neighbour, needed to evaluate c.
        un0 = u[kp1 + 0 * ursdblock];
        un1 = u[kp1 + 1 * ursdblock];
        un2 = u[kp1 + 2 * ursdblock];
        un3 = u[kp1 + 3 * ursdblock];
        un4 = u[kp1 + 4 * ursdblock];

        // The first contribution initialises the accumulators.
        v0 = omega_d * ( jacu_c_value<0,0>(un0, un1, un2, un3, un4) * vn0 );
        v1 = omega_d * ( jacu_c_value<0,1>(un0, un1, un2, un3, un4) * vn0 );
        v2 = omega_d * ( jacu_c_value<0,2>(un0, un1, un2, un3, un4) * vn0 );
        v3 = omega_d * ( jacu_c_value<0,3>(un0, un1, un2, un3, un4) * vn0 );
        v4 = omega_d * ( jacu_c_value<0,4>(un0, un1, un2, un3, un4) * vn0 );

        v0 += omega_d * ( jacu_c_value<1,0>(un0, un1, un2, un3, un4) * vn1 );
        v1 += omega_d * ( jacu_c_value<1,1>(un0, un1, un2, un3, un4) * vn1 );
        v2 += omega_d * ( jacu_c_value<1,2>(un0, un1, un2, un3, un4) * vn1 );
        v3 += omega_d * ( jacu_c_value<1,3>(un0, un1, un2, un3, un4) * vn1 );
        v4 += omega_d * ( jacu_c_value<1,4>(un0, un1, un2, un3, un4) * vn1 );

        v0 += omega_d * ( jacu_c_value<2,0>(un0, un1, un2, un3, un4) * vn2 );
        v1 += omega_d * ( jacu_c_value<2,1>(un0, un1, un2, un3, un4) * vn2 );
        v2 += omega_d * ( jacu_c_value<2,2>(un0, un1, un2, un3, un4) * vn2 );
        v3 += omega_d * ( jacu_c_value<2,3>(un0, un1, un2, un3, un4) * vn2 );
        v4 += omega_d * ( jacu_c_value<2,4>(un0, un1, un2, un3, un4) * vn2 );

        v0 += omega_d * ( jacu_c_value<3,0>(un0, un1, un2, un3, un4) * vn3 );
        v1 += omega_d * ( jacu_c_value<3,1>(un0, un1, un2, un3, un4) * vn3 );
        v2 += omega_d * ( jacu_c_value<3,2>(un0, un1, un2, un3, un4) * vn3 );
        v3 += omega_d * ( jacu_c_value<3,3>(un0, un1, un2, un3, un4) * vn3 );
        v4 += omega_d * ( jacu_c_value<3,4>(un0, un1, un2, un3, un4) * vn3 );

        v0 += omega_d * ( jacu_c_value<4,0>(un0, un1, un2, un3, un4) * vn4 );
        v1 += omega_d * ( jacu_c_value<4,1>(un0, un1, un2, un3, un4) * vn4 );
        v2 += omega_d * ( jacu_c_value<4,2>(un0, un1, un2, un3, un4) * vn4 );
        v3 += omega_d * ( jacu_c_value<4,3>(un0, un1, un2, un3, un4) * vn4 );
        v4 += omega_d * ( jacu_c_value<4,4>(un0, un1, un2, un3, un4) * vn4 );

        // ---- j direction: neighbour (k, j+1, i), b block ----
        const int jp1 = hyperplane_index(k, j+1, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);

        vn0 = v[jp1 + 0 * ursdblock];
        vn1 = v[jp1 + 1 * ursdblock];
        vn2 = v[jp1 + 2 * ursdblock];
        vn3 = v[jp1 + 3 * ursdblock];
        vn4 = v[jp1 + 4 * ursdblock];

        // u at the neighbour, needed to evaluate b.
        un0 = u[jp1 + 0 * ursdblock];
        un1 = u[jp1 + 1 * ursdblock];
        un2 = u[jp1 + 2 * ursdblock];
        un3 = u[jp1 + 3 * ursdblock];
        un4 = u[jp1 + 4 * ursdblock];

        v0 += omega_d * ( jacu_b_value<0,0>(un0, un1, un2, un3, un4) * vn0 );
        v1 += omega_d * ( jacu_b_value<0,1>(un0, un1, un2, un3, un4) * vn0 );
        v2 += omega_d * ( jacu_b_value<0,2>(un0, un1, un2, un3, un4) * vn0 );
        v3 += omega_d * ( jacu_b_value<0,3>(un0, un1, un2, un3, un4) * vn0 );
        v4 += omega_d * ( jacu_b_value<0,4>(un0, un1, un2, un3, un4) * vn0 );

        v0 += omega_d * ( jacu_b_value<1,0>(un0, un1, un2, un3, un4) * vn1 );
        v1 += omega_d * ( jacu_b_value<1,1>(un0, un1, un2, un3, un4) * vn1 );
        v2 += omega_d * ( jacu_b_value<1,2>(un0, un1, un2, un3, un4) * vn1 );
        v3 += omega_d * ( jacu_b_value<1,3>(un0, un1, un2, un3, un4) * vn1 );
        v4 += omega_d * ( jacu_b_value<1,4>(un0, un1, un2, un3, un4) * vn1 );

        v0 += omega_d * ( jacu_b_value<2,0>(un0, un1, un2, un3, un4) * vn2 );
        v1 += omega_d * ( jacu_b_value<2,1>(un0, un1, un2, un3, un4) * vn2 );
        v2 += omega_d * ( jacu_b_value<2,2>(un0, un1, un2, un3, un4) * vn2 );
        v3 += omega_d * ( jacu_b_value<2,3>(un0, un1, un2, un3, un4) * vn2 );
        v4 += omega_d * ( jacu_b_value<2,4>(un0, un1, un2, un3, un4) * vn2 );

        v0 += omega_d * ( jacu_b_value<3,0>(un0, un1, un2, un3, un4) * vn3 );
        v1 += omega_d * ( jacu_b_value<3,1>(un0, un1, un2, un3, un4) * vn3 );
        v2 += omega_d * ( jacu_b_value<3,2>(un0, un1, un2, un3, un4) * vn3 );
        v3 += omega_d * ( jacu_b_value<3,3>(un0, un1, un2, un3, un4) * vn3 );
        v4 += omega_d * ( jacu_b_value<3,4>(un0, un1, un2, un3, un4) * vn3 );

        v0 += omega_d * ( jacu_b_value<4,0>(un0, un1, un2, un3, un4) * vn4 );
        v1 += omega_d * ( jacu_b_value<4,1>(un0, un1, un2, un3, un4) * vn4 );
        v2 += omega_d * ( jacu_b_value<4,2>(un0, un1, un2, un3, un4) * vn4 );
        v3 += omega_d * ( jacu_b_value<4,3>(un0, un1, un2, un3, un4) * vn4 );
        v4 += omega_d * ( jacu_b_value<4,4>(un0, un1, un2, un3, un4) * vn4 );

        // ---- i direction: neighbour (k, j, i+1), a block ----
        const int ip1 = hyperplane_index(k, j, i+1, 0, wave_offset_2d, wave_offset_3d, thread_map_d);

        vn0 = v[ip1 + 0 * ursdblock];
        vn1 = v[ip1 + 1 * ursdblock];
        vn2 = v[ip1 + 2 * ursdblock];
        vn3 = v[ip1 + 3 * ursdblock];
        vn4 = v[ip1 + 4 * ursdblock];

        // u at the neighbour, needed to evaluate a.
        un0 = u[ip1 + 0 * ursdblock];
        un1 = u[ip1 + 1 * ursdblock];
        un2 = u[ip1 + 2 * ursdblock];
        un3 = u[ip1 + 3 * ursdblock];
        un4 = u[ip1 + 4 * ursdblock];

        v0 += omega_d * ( jacu_a_value<0,0>(un0, un1, un2, un3, un4) * vn0 );
        v1 += omega_d * ( jacu_a_value<0,1>(un0, un1, un2, un3, un4) * vn0 );
        v2 += omega_d * ( jacu_a_value<0,2>(un0, un1, un2, un3, un4) * vn0 );
        v3 += omega_d * ( jacu_a_value<0,3>(un0, un1, un2, un3, un4) * vn0 );
        v4 += omega_d * ( jacu_a_value<0,4>(un0, un1, un2, un3, un4) * vn0 );

        v0 += omega_d * ( jacu_a_value<1,0>(un0, un1, un2, un3, un4) * vn1 );
        v1 += omega_d * ( jacu_a_value<1,1>(un0, un1, un2, un3, un4) * vn1 );
        v2 += omega_d * ( jacu_a_value<1,2>(un0, un1, un2, un3, un4) * vn1 );
        v3 += omega_d * ( jacu_a_value<1,3>(un0, un1, un2, un3, un4) * vn1 );
        v4 += omega_d * ( jacu_a_value<1,4>(un0, un1, un2, un3, un4) * vn1 );

        v0 += omega_d * ( jacu_a_value<2,0>(un0, un1, un2, un3, un4) * vn2 );
        v1 += omega_d * ( jacu_a_value<2,1>(un0, un1, un2, un3, un4) * vn2 );
        v2 += omega_d * ( jacu_a_value<2,2>(un0, un1, un2, un3, un4) * vn2 );
        v3 += omega_d * ( jacu_a_value<2,3>(un0, un1, un2, un3, un4) * vn2 );
        v4 += omega_d * ( jacu_a_value<2,4>(un0, un1, un2, un3, un4) * vn2 );

        v0 += omega_d * ( jacu_a_value<3,0>(un0, un1, un2, un3, un4) * vn3 );
        v1 += omega_d * ( jacu_a_value<3,1>(un0, un1, un2, un3, un4) * vn3 );
        v2 += omega_d * ( jacu_a_value<3,2>(un0, un1, un2, un3, un4) * vn3 );
        v3 += omega_d * ( jacu_a_value<3,3>(un0, un1, un2, un3, un4) * vn3 );
        v4 += omega_d * ( jacu_a_value<3,4>(un0, un1, un2, un3, un4) * vn3 );

        v0 += omega_d * ( jacu_a_value<4,0>(un0, un1, un2, un3, un4) * vn4 );
        v1 += omega_d * ( jacu_a_value<4,1>(un0, un1, un2, un3, un4) * vn4 );
        v2 += omega_d * ( jacu_a_value<4,2>(un0, un1, un2, un3, un4) * vn4 );
        v3 += omega_d * ( jacu_a_value<4,3>(un0, un1, un2, un3, un4) * vn4 );
        v4 += omega_d * ( jacu_a_value<4,4>(un0, un1, un2, un3, un4) * vn4 );

        /**
         * Diagonal block inversion.
         */
        // u at the cell itself, needed to evaluate d.
        un0 = u[centre + 0 * ursdblock];
        un1 = u[centre + 1 * ursdblock];
        un2 = u[centre + 2 * ursdblock];
        un3 = u[centre + 3 * ursdblock];
        un4 = u[centre + 4 * ursdblock];

        // tmatXY is filled from jacu_d_value<X,Y>.
        tmat00 = jacu_d_value<0,0>(un0, un1, un2, un3, un4);
        tmat10 = jacu_d_value<1,0>(un0, un1, un2, un3, un4);
        tmat20 = jacu_d_value<2,0>(un0, un1, un2, un3, un4);
        tmat30 = jacu_d_value<3,0>(un0, un1, un2, un3, un4);
        tmat40 = jacu_d_value<4,0>(un0, un1, un2, un3, un4);

        tmat01 = jacu_d_value<0,1>(un0, un1, un2, un3, un4);
        tmat11 = jacu_d_value<1,1>(un0, un1, un2, un3, un4);
        tmat21 = jacu_d_value<2,1>(un0, un1, un2, un3, un4);
        tmat31 = jacu_d_value<3,1>(un0, un1, un2, un3, un4);
        tmat41 = jacu_d_value<4,1>(un0, un1, un2, un3, un4);

        tmat02 = jacu_d_value<0,2>(un0, un1, un2, un3, un4);
        tmat12 = jacu_d_value<1,2>(un0, un1, un2, un3, un4);
        tmat22 = jacu_d_value<2,2>(un0, un1, un2, un3, un4);
        tmat32 = jacu_d_value<3,2>(un0, un1, un2, un3, un4);
        tmat42 = jacu_d_value<4,2>(un0, un1, un2, un3, un4);

        tmat03 = jacu_d_value<0,3>(un0, un1, un2, un3, un4);
        tmat13 = jacu_d_value<1,3>(un0, un1, un2, un3, un4);
        tmat23 = jacu_d_value<2,3>(un0, un1, un2, un3, un4);
        tmat33 = jacu_d_value<3,3>(un0, un1, un2, un3, un4);
        tmat43 = jacu_d_value<4,3>(un0, un1, un2, un3, un4);

        tmat04 = jacu_d_value<0,4>(un0, un1, un2, un3, un4);
        tmat14 = jacu_d_value<1,4>(un0, un1, un2, un3, un4);
        tmat24 = jacu_d_value<2,4>(un0, un1, un2, un3, un4);
        tmat34 = jacu_d_value<3,4>(un0, un1, un2, un3, un4);
        tmat44 = jacu_d_value<4,4>(un0, un1, un2, un3, un4);

        // Forward elimination without pivoting; the statement order below is
        // the original one and must not be changed.
        // ip = 0.
        tmp1 = fpone / tmat00;

        tmp = tmp1 * tmat01;
        tmat11 -= tmp * tmat10;
        tmat21 -= tmp * tmat20;
        tmat31 -= tmp * tmat30;
        tmat41 -= tmp * tmat40;
        v1 -= v0 * tmp;

        tmp = tmp1 * tmat02;
        tmat12 -= tmp * tmat10;
        tmat22 -= tmp * tmat20;
        tmat32 -= tmp * tmat30;
        tmat42 -= tmp * tmat40;
        v2 -= v0 * tmp;

        tmp = tmp1 * tmat03;
        tmat13 -= tmp * tmat10;
        tmat23 -= tmp * tmat20;
        tmat33 -= tmp * tmat30;
        tmat43 -= tmp * tmat40;
        v3 -= v0 * tmp;

        tmp = tmp1 * tmat04;
        tmat14 -= tmp * tmat10;
        tmat24 -= tmp * tmat20;
        tmat34 -= tmp * tmat30;
        tmat44 -= tmp * tmat40;
        v4 -= v0 * tmp;

        // ip = 1.
        tmp1 = fpone / tmat11;

        tmp = tmp1 * tmat12;
        tmat22 -= tmp * tmat21;
        tmat32 -= tmp * tmat31;
        tmat42 -= tmp * tmat41;
        v2 -= v1 * tmp;

        tmp = tmp1 * tmat13;
        tmat23 -= tmp * tmat21;
        tmat33 -= tmp * tmat31;
        tmat43 -= tmp * tmat41;
        v3 -= v1 * tmp;

        tmp = tmp1 * tmat14;
        tmat24 -= tmp * tmat21;
        tmat34 -= tmp * tmat31;
        tmat44 -= tmp * tmat41;
        v4 -= v1 * tmp;

        // ip = 2
        tmp1 = fpone / tmat22;

        tmp = tmp1 * tmat23;
        tmat33 -= tmp * tmat32;
        tmat43 -= tmp * tmat42;
        v3 -= v2 * tmp;

        tmp = tmp1 * tmat24;
        tmat34 -= tmp * tmat32;
        tmat44 -= tmp * tmat42;
        v4 -= v2 * tmp;

        // ip = 3
        tmp1 = fpone / tmat33;

        tmp = tmp1 * tmat34;
        tmat44 -= tmp * tmat43;
        v4 -= v3 * tmp;

        /**
         * Back substitution.
         */
        v4 = v4 / tmat44;

        v3 = v3 - tmat43 * v4;
        v3 = v3 / tmat33;

        v2 = v2 - tmat32 * v3 - tmat42 * v4;
        v2 = v2 / tmat22;

        v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4;
        v1 = v1 / tmat11;

        v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4;
        v0 = v0 / tmat00;

        // Update the values of v.
        v[centre + 0 * ursdblock] -= v0;
        v[centre + 1 * ursdblock] -= v1;
        v[centre + 2 * ursdblock] -= v2;
        v[centre + 3 * ursdblock] -= v3;
        v[centre + 4 * ursdblock] -= v4;
    }
}
cuda/kernels/blts/blts.cuh0000644000175600017620000003330711457045424014245 0ustar sjpsjp// Some macros to increase readability.
#define ursdblock (problem_height * (isiz2 + 4) * (isiz1 + 4))

/**
 * CUDA kernel to compute the regular-sparse, block lower triangular solution.
 *      v <-- ( L-inv ) * v
 */
__global__ void blts_kernel(fp_type* v, fp_type* u, const int wave, const int starting_k, int* columns_d, int* rows_d, int* wave_offset_2d, int* wave_offset_3d, int* thread_map_d)
{
    /**
     * Local variables.
     */
    int i, j, k;
    fp_type tmp, tmp1;

    // Constants.
    const fp_type fpone = 1.0e+00;

    // Use five temporary variables for aggregation.
    fp_type v0, v1, v2, v3, v4;
    fp_type vn0, vn1, vn2, vn3, vn4;
    fp_type un0, un1, un2, un3, un4;

    // Replace tmat[5][5] with 25 fp_types to be stored in registers.
    fp_type tmat00, tmat01, tmat02, tmat03, tmat04,
            tmat10, tmat11, tmat12, tmat13, tmat14,
            tmat20, tmat21, tmat22, tmat23, tmat24,
            tmat30, tmat31, tmat32, tmat33, tmat34,
            tmat40, tmat41, tmat42, tmat43, tmat44;

    // Calculate actual thread i
    int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    int threads = (gridDim.x * blockDim.x);

    // Each thread actually processes (cells / threads) cells in a coalesced manner.
    int cell;
    for (cell = tid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) {

        i = columns_d[cell];
        j = rows_d[cell];
        k = starting_k + (wave - (i + j));

        int depth = (wave - (i + j));
        if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2 && depth >= 0 && depth <= kblock - 1) {

            int index = hyperplane_index(k, j, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d);

            // Initialise values of v.
v0 = v[index + 0 * ursdblock]; v1 = v[index + 1 * ursdblock]; v2 = v[index + 2 * ursdblock]; v3 = v[index + 3 * ursdblock]; v4 = v[index + 4 * ursdblock]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). int above = hyperplane_index(k-1, j, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d); // Read in u neighbours, for calculation of a. un0 = u[above + 0 * ursdblock]; un1 = u[above + 1 * ursdblock]; un2 = u[above + 2 * ursdblock]; un3 = u[above + 3 * ursdblock]; un4 = u[above + 4 * ursdblock]; vn0 = v[above + 0 * ursdblock]; v0 = v0 - omega_d * ( jacld_a_value<0,0>(un0, un1, un2, un3, un4) * vn0 ); v1 = v1 - omega_d * ( jacld_a_value<0,1>(un0, un1, un2, un3, un4) * vn0 ); v2 = v2 - omega_d * ( jacld_a_value<0,2>(un0, un1, un2, un3, un4) * vn0 ); v3 = v3 - omega_d * ( jacld_a_value<0,3>(un0, un1, un2, un3, un4) * vn0 ); v4 = v4 - omega_d * ( jacld_a_value<0,4>(un0, un1, un2, un3, un4) * vn0 ); vn1 = v[above + 1 * ursdblock]; v0 = v0 - omega_d * ( jacld_a_value<1,0>(un0, un1, un2, un3, un4) * vn1 ); v1 = v1 - omega_d * ( jacld_a_value<1,1>(un0, un1, un2, un3, un4) * vn1 ); v2 = v2 - omega_d * ( jacld_a_value<1,2>(un0, un1, un2, un3, un4) * vn1 ); v3 = v3 - omega_d * ( jacld_a_value<1,3>(un0, un1, un2, un3, un4) * vn1 ); v4 = v4 - omega_d * ( jacld_a_value<1,4>(un0, un1, un2, un3, un4) * vn1 ); vn2 = v[above + 2 * ursdblock]; v0 = v0 - omega_d * ( jacld_a_value<2,0>(un0, un1, un2, un3, un4) * vn2 ); v1 = v1 - omega_d * ( jacld_a_value<2,1>(un0, un1, un2, un3, un4) * vn2 ); v2 = v2 - omega_d * ( jacld_a_value<2,2>(un0, un1, un2, un3, un4) * vn2 ); v3 = v3 - omega_d * ( jacld_a_value<2,3>(un0, un1, un2, un3, un4) * vn2 ); v4 = v4 - omega_d * ( jacld_a_value<2,4>(un0, un1, un2, un3, un4) * vn2 ); vn3 = v[above + 3 * ursdblock]; v0 = v0 - omega_d * ( jacld_a_value<3,0>(un0, un1, un2, un3, un4) * vn3 ); v1 = v1 - omega_d * ( jacld_a_value<3,1>(un0, un1, un2, un3, un4) * vn3 ); v2 = v2 - omega_d * ( 
jacld_a_value<3,2>(un0, un1, un2, un3, un4) * vn3 ); v3 = v3 - omega_d * ( jacld_a_value<3,3>(un0, un1, un2, un3, un4) * vn3 ); v4 = v4 - omega_d * ( jacld_a_value<3,4>(un0, un1, un2, un3, un4) * vn3 ); vn4 = v[above + 4 * ursdblock]; v0 = v0 - omega_d * ( jacld_a_value<4,0>(un0, un1, un2, un3, un4) * vn4 ); v1 = v1 - omega_d * ( jacld_a_value<4,1>(un0, un1, un2, un3, un4) * vn4 ); v2 = v2 - omega_d * ( jacld_a_value<4,2>(un0, un1, un2, un3, un4) * vn4 ); v3 = v3 - omega_d * ( jacld_a_value<4,3>(un0, un1, un2, un3, un4) * vn4 ); v4 = v4 - omega_d * ( jacld_a_value<4,4>(un0, un1, un2, un3, un4) * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). int north = hyperplane_index(k, j-1, i, 0, wave_offset_2d, wave_offset_3d, thread_map_d); // Read in u neighbours, for calculation of b. un0 = u[north + 0 * ursdblock]; un1 = u[north + 1 * ursdblock]; un2 = u[north + 2 * ursdblock]; un3 = u[north + 3 * ursdblock]; un4 = u[north + 4 * ursdblock]; vn0 = v[north + 0 * ursdblock]; v0 = v0 - omega_d * ( jacld_b_value<0,0>(un0, un1, un2, un3, un4) * vn0 ); v1 = v1 - omega_d * ( jacld_b_value<0,1>(un0, un1, un2, un3, un4) * vn0 ); v2 = v2 - omega_d * ( jacld_b_value<0,2>(un0, un1, un2, un3, un4) * vn0 ); v3 = v3 - omega_d * ( jacld_b_value<0,3>(un0, un1, un2, un3, un4) * vn0 ); v4 = v4 - omega_d * ( jacld_b_value<0,4>(un0, un1, un2, un3, un4) * vn0 ); vn1 = v[north + 1 * ursdblock]; v0 = v0 - omega_d * ( jacld_b_value<1,0>(un0, un1, un2, un3, un4) * vn1 ); v1 = v1 - omega_d * ( jacld_b_value<1,1>(un0, un1, un2, un3, un4) * vn1 ); v2 = v2 - omega_d * ( jacld_b_value<1,2>(un0, un1, un2, un3, un4) * vn1 ); v3 = v3 - omega_d * ( jacld_b_value<1,3>(un0, un1, un2, un3, un4) * vn1 ); v4 = v4 - omega_d * ( jacld_b_value<1,4>(un0, un1, un2, un3, un4) * vn1 ); vn2 = v[north + 2 * ursdblock]; v0 = v0 - omega_d * ( jacld_b_value<2,0>(un0, un1, un2, un3, un4) * vn2 ); v1 = v1 - omega_d * ( jacld_b_value<2,1>(un0, un1, un2, 
un3, un4) * vn2 ); v2 = v2 - omega_d * ( jacld_b_value<2,2>(un0, un1, un2, un3, un4) * vn2 ); v3 = v3 - omega_d * ( jacld_b_value<2,3>(un0, un1, un2, un3, un4) * vn2 ); v4 = v4 - omega_d * ( jacld_b_value<2,4>(un0, un1, un2, un3, un4) * vn2 ); vn3 = v[north + 3 * ursdblock]; v0 = v0 - omega_d * ( jacld_b_value<3,0>(un0, un1, un2, un3, un4) * vn3 ); v1 = v1 - omega_d * ( jacld_b_value<3,1>(un0, un1, un2, un3, un4) * vn3 ); v2 = v2 - omega_d * ( jacld_b_value<3,2>(un0, un1, un2, un3, un4) * vn3 ); v3 = v3 - omega_d * ( jacld_b_value<3,3>(un0, un1, un2, un3, un4) * vn3 ); v4 = v4 - omega_d * ( jacld_b_value<3,4>(un0, un1, un2, un3, un4) * vn3 ); vn4 = v[north + 4 * ursdblock]; v0 = v0 - omega_d * ( jacld_b_value<4,0>(un0, un1, un2, un3, un4) * vn4 ); v1 = v1 - omega_d * ( jacld_b_value<4,1>(un0, un1, un2, un3, un4) * vn4 ); v2 = v2 - omega_d * ( jacld_b_value<4,2>(un0, un1, un2, un3, un4) * vn4 ); v3 = v3 - omega_d * ( jacld_b_value<4,3>(un0, un1, un2, un3, un4) * vn4 ); v4 = v4 - omega_d * ( jacld_b_value<4,4>(un0, un1, un2, un3, un4) * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). int west = hyperplane_index(k, j, i-1, 0, wave_offset_2d, wave_offset_3d, thread_map_d); // Read in u neighbours, for calculation of c. 
un0 = u[west + 0 * ursdblock]; un1 = u[west + 1 * ursdblock]; un2 = u[west + 2 * ursdblock]; un3 = u[west + 3 * ursdblock]; un4 = u[west + 4 * ursdblock]; vn0 = v[west + 0 * ursdblock]; v0 = v0 - omega_d * ( jacld_c_value<0,0>(un0, un1, un2, un3, un4) * vn0 ); v1 = v1 - omega_d * ( jacld_c_value<0,1>(un0, un1, un2, un3, un4) * vn0 ); v2 = v2 - omega_d * ( jacld_c_value<0,2>(un0, un1, un2, un3, un4) * vn0 ); v3 = v3 - omega_d * ( jacld_c_value<0,3>(un0, un1, un2, un3, un4) * vn0 ); v4 = v4 - omega_d * ( jacld_c_value<0,4>(un0, un1, un2, un3, un4) * vn0 ); vn1 = v[west + 1 * ursdblock]; v0 = v0 - omega_d * ( jacld_c_value<1,0>(un0, un1, un2, un3, un4) * vn1 ); v1 = v1 - omega_d * ( jacld_c_value<1,1>(un0, un1, un2, un3, un4) * vn1 ); v2 = v2 - omega_d * ( jacld_c_value<1,2>(un0, un1, un2, un3, un4) * vn1 ); v3 = v3 - omega_d * ( jacld_c_value<1,3>(un0, un1, un2, un3, un4) * vn1 ); v4 = v4 - omega_d * ( jacld_c_value<1,4>(un0, un1, un2, un3, un4) * vn1 ); vn2 = v[west + 2 * ursdblock]; v0 = v0 - omega_d * ( jacld_c_value<2,0>(un0, un1, un2, un3, un4) * vn2 ); v1 = v1 - omega_d * ( jacld_c_value<2,1>(un0, un1, un2, un3, un4) * vn2 ); v2 = v2 - omega_d * ( jacld_c_value<2,2>(un0, un1, un2, un3, un4) * vn2 ); v3 = v3 - omega_d * ( jacld_c_value<2,3>(un0, un1, un2, un3, un4) * vn2 ); v4 = v4 - omega_d * ( jacld_c_value<2,4>(un0, un1, un2, un3, un4) * vn2 ); vn3 = v[west + 3 * ursdblock]; v0 = v0 - omega_d * ( jacld_c_value<3,0>(un0, un1, un2, un3, un4) * vn3 ); v1 = v1 - omega_d * ( jacld_c_value<3,1>(un0, un1, un2, un3, un4) * vn3 ); v2 = v2 - omega_d * ( jacld_c_value<3,2>(un0, un1, un2, un3, un4) * vn3 ); v3 = v3 - omega_d * ( jacld_c_value<3,3>(un0, un1, un2, un3, un4) * vn3 ); v4 = v4 - omega_d * ( jacld_c_value<3,4>(un0, un1, un2, un3, un4) * vn3 ); vn4 = v[west + 4 * ursdblock]; v0 = v0 - omega_d * ( jacld_c_value<4,0>(un0, un1, un2, un3, un4) * vn4 ); v1 = v1 - omega_d * ( jacld_c_value<4,1>(un0, un1, un2, un3, un4) * vn4 ); v2 = v2 - omega_d * ( 
jacld_c_value<4,2>(un0, un1, un2, un3, un4) * vn4 ); v3 = v3 - omega_d * ( jacld_c_value<4,3>(un0, un1, un2, un3, un4) * vn4 ); v4 = v4 - omega_d * ( jacld_c_value<4,4>(un0, un1, un2, un3, un4) * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. un0 = u[index + 0 * ursdblock]; un1 = u[index + 1 * ursdblock]; un2 = u[index + 2 * ursdblock]; un3 = u[index + 3 * ursdblock]; un4 = u[index + 4 * ursdblock]; tmat00 = jacld_d_value<0,0>(un0, un1, un2, un3, un4); tmat10 = jacld_d_value<1,0>(un0, un1, un2, un3, un4); tmat20 = jacld_d_value<2,0>(un0, un1, un2, un3, un4); tmat30 = jacld_d_value<3,0>(un0, un1, un2, un3, un4); tmat40 = jacld_d_value<4,0>(un0, un1, un2, un3, un4); tmat01 = jacld_d_value<0,1>(un0, un1, un2, un3, un4); tmat11 = jacld_d_value<1,1>(un0, un1, un2, un3, un4); tmat21 = jacld_d_value<2,1>(un0, un1, un2, un3, un4); tmat31 = jacld_d_value<3,1>(un0, un1, un2, un3, un4); tmat41 = jacld_d_value<4,1>(un0, un1, un2, un3, un4); tmat02 = jacld_d_value<0,2>(un0, un1, un2, un3, un4); tmat12 = jacld_d_value<1,2>(un0, un1, un2, un3, un4); tmat22 = jacld_d_value<2,2>(un0, un1, un2, un3, un4); tmat32 = jacld_d_value<3,2>(un0, un1, un2, un3, un4); tmat42 = jacld_d_value<4,2>(un0, un1, un2, un3, un4); tmat03 = jacld_d_value<0,3>(un0, un1, un2, un3, un4); tmat13 = jacld_d_value<1,3>(un0, un1, un2, un3, un4); tmat23 = jacld_d_value<2,3>(un0, un1, un2, un3, un4); tmat33 = jacld_d_value<3,3>(un0, un1, un2, un3, un4); tmat43 = jacld_d_value<4,3>(un0, un1, un2, un3, un4); tmat04 = jacld_d_value<0,4>(un0, un1, un2, un3, un4); tmat14 = jacld_d_value<1,4>(un0, un1, un2, un3, un4); tmat24 = jacld_d_value<2,4>(un0, un1, un2, un3, un4); tmat34 = jacld_d_value<3,4>(un0, un1, un2, un3, un4); tmat44 = jacld_d_value<4,4>(un0, un1, un2, un3, un4); // ip = 0. 
tmp1 = fpone / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. tmp1 = fpone / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = fpone / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = fpone / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. 
v[index + 0 * ursdblock] = v0; v[index + 1 * ursdblock] = v1; v[index + 2 * ursdblock] = v2; v[index + 3 * ursdblock] = v3; v[index + 4 * ursdblock] = v4; } } } cuda/kernels/jacu/jacu_d.cuh0000644000175600017620000001045411440544055014476 0ustar sjpsjp/** * Device function for calculating, just-in-time, the value of d[k][j][i][l][m]. */ template __device__ fp_type jacu_d_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4) { // Some constants. const fp_type c1 = c1_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type r43 = ( fpfour / fpthree ); const fp_type c1345 = c1 * c3 * c4 * c5; const fp_type c34 = c3 * c4; const fp_type tmp1 = fpone / u0; const fp_type tmp2 = tmp1 * tmp1; const fp_type tmp3 = tmp1 * tmp2; fp_type result; if (m == 0) { if (l == 0) result = fpone + dt_d * fptwo * ( tx1_d * dx1_d + ty1_d * dy1_d + tz1_d * dz1_d ); if (l == 1) result = fpzero; if (l == 2) result = fpzero; if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 1) { if (l == 0) { result = dt_d * fptwo * ( tx1_d * ( - r43 * c34 * tmp2 * u1 ) + ty1_d * ( - c34 * tmp2 * u1 ) + tz1_d * ( - c34 * tmp2 * u1 ) ); } if (l == 1) result = fpone + dt_d * fptwo * ( tx1_d * r43 * c34 * tmp1 + ty1_d * c34 * tmp1 + tz1_d * c34 * tmp1 ) + dt_d * fptwo * ( tx1_d * dx2_d + ty1_d * dy2_d + tz1_d * dz2_d ); if (l == 2) result = fpzero; if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 2) { if (l == 0) { result = dt_d * fptwo * ( tx1_d * ( - c34 * tmp2 * u2 ) + ty1_d * ( - r43 * c34 * tmp2 * u2 ) + tz1_d * ( - c34 * tmp2 * u2 ) ); } if (l == 1) result = fpzero; if (l == 2) result = fpone + dt_d * fptwo * ( tx1_d * c34 * tmp1 + ty1_d * r43 * c34 * tmp1 + tz1_d * c34 * tmp1 ) + dt_d * fptwo * ( tx1_d * dx3_d + ty1_d * dy3_d + 
tz1_d * dz3_d ); if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 3) { if (l == 0) { result = dt_d * fptwo * ( tx1_d * ( - c34 * tmp2 * u3 ) + ty1_d * ( - c34 * tmp2 * u3 ) + tz1_d * ( - r43 * c34 * tmp2 * u3 ) ); } if (l == 1) result = fpzero; if (l == 2) result = fpzero; if (l == 3) result = fpone + dt_d * fptwo * ( tx1_d * c34 * tmp1 + ty1_d * c34 * tmp1 + tz1_d * r43 * c34 * tmp1 ) + dt_d * fptwo * ( tx1_d * dx4_d + ty1_d * dy4_d + tz1_d * dz4_d ); if (l == 4) result = fpzero; } if (m == 4) { if (l == 0) { result = dt_d * fptwo * ( tx1_d * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) ); } if (l == 1) { result = dt_d * fptwo * ( tx1_d * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1_d * ( c34 - c1345 ) * tmp2 * u1 + tz1_d * ( c34 - c1345 ) * tmp2 * u1 ); } if (l == 2) { result = dt_d * fptwo * ( tx1_d * ( c34 - c1345 ) * tmp2 * u2 + ty1_d * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1_d * ( c34 - c1345 ) * tmp2 * u2 ); } if (l == 3) { result = dt_d * fptwo * ( tx1_d * ( c34 - c1345 ) * tmp2 * u3 + ty1_d * ( c34 - c1345 ) * tmp2 * u3 + tz1_d * ( r43 * c34 - c1345 ) * tmp2 * u3 ); } if (l == 4) result = fpone + dt_d * fptwo * ( tx1_d * c1345 * tmp1 + ty1_d * c1345 * tmp1 + tz1_d * c1345 * tmp1 ) + dt_d * fptwo * ( tx1_d * dx5_d + ty1_d * dy5_d + tz1_d * dz5_d ); } return result; } cuda/kernels/jacu/jacu_c.cuh0000644000175600017620000000645411440543772014507 0ustar sjpsjp/** * Device function for calculating, just-in-time, the value of c[k][j][i][l][m]. 
*/ template __device__ fp_type jacu_c_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4) { // Some constants. const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type r43 = ( fpfour / fpthree ); const fp_type c1345 = c1 * c3 * c4 * c5; const fp_type c34 = c3 * c4; const fp_type tmp1 = fpone / u0; const fp_type tmp2 = tmp1 * tmp1; const fp_type tmp3 = tmp1 * tmp2; fp_type result; if (m == 0) { if (l == 0) result = -dt_d * tz1_d * dz1_d; if (l == 1) result = fpzero; if (l == 2) result = fpzero; if (l == 3) result = dt_d * tz2_d; if (l == 4) result = fpzero; } if (m == 1) { if (l == 0) result = dt_d * tz2_d * ( - ( u1 * u3 ) * tmp2 ) - dt_d * tz1_d * ( - c34 * tmp2 * u1 ); if (l == 1) result = dt_d * tz2_d * ( u3 * tmp1 ) - dt_d * tz1_d * c34 * tmp1 - dt_d * tz1_d * dz2_d; if (l == 2) result = fpzero; if (l == 3) result = dt_d * tz2_d * ( u1 * tmp1 ); if (l == 4) result = fpzero; } if (m == 2) { if (l == 0) result = dt_d * tz2_d * ( - ( u2 * u3 ) * tmp2 ) - dt_d * tz1_d * ( - c34 * tmp2 * u2 ); if (l == 1) result = fpzero; if (l == 2) result = dt_d * tz2_d * ( u3 * tmp1 ) - dt_d * tz1_d * ( c34 * tmp1 ) - dt_d * tz1_d * dz3_d; if (l == 3) result = dt_d * tz2_d * ( u2 * tmp1 ); if (l == 4) result = fpzero; } if (m == 3) { if (l == 0) result = dt_d * tz2_d * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + fphalf * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) -dt_d * tz1_d * ( - r43 * c34 * tmp2 * u3 ); if (l == 1) result = dt_d * tz2_d * ( - c2 * ( u1 * tmp1 ) ); if (l == 2) result = dt_d * tz2_d * ( - c2 * ( u2 * tmp1 ) ); if (l == 3) result = dt_d * tz2_d * ( fptwo - c2 ) * ( u3 * tmp1 ) - dt_d * tz1_d * ( r43 * c34 * tmp1 ) - dt_d * tz1_d * dz4_d; if 
(l == 4) result = dt_d * tz2_d * c2; } if (m == 4) { if (l == 0) result = dt_d * tz2_d * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt_d * tz1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 ); if (l == 1) result = dt_d * tz2_d * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt_d * tz1_d * ( c34 - c1345 ) * tmp2 * u1; if (l == 2) result = dt_d * tz2_d * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt_d * tz1_d * ( c34 - c1345 ) * tmp2 * u2; if (l == 3) result = dt_d * tz2_d * ( c1 * ( u4 * tmp1 ) - fphalf * c2 * ( ( u1 * u1 + u2 * u2 + fpthree * u3 * u3 ) * tmp2 ) ) - dt_d * tz1_d * ( r43 * c34 - c1345 ) * tmp2 * u3; if (l == 4) result = dt_d * tz2_d * ( c1 * ( u3 * tmp1 ) ) - dt_d * tz1_d * c1345 * tmp1 - dt_d * tz1_d * dz5_d; } return result; } cuda/kernels/jacu/jacu_b.cuh0000644000175600017620000000653111440543635014500 0ustar sjpsjp/** * Device function for calculating, just-in-time, the value of b[k][j][i][l][m]. */ template __device__ fp_type jacu_b_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4) { // Some constants. 
const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type r43 = ( fpfour / fpthree ); const fp_type c1345 = c1 * c3 * c4 * c5; const fp_type c34 = c3 * c4; const fp_type tmp1 = fpone / u0; const fp_type tmp2 = tmp1 * tmp1; const fp_type tmp3 = tmp1 * tmp2; fp_type result; if (m == 0) { if (l == 0) result = -dt_d * ty1_d * dy1_d; if (l == 1) result = fpzero; if (l == 2) result = dt_d * ty2_d; if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 1) { if (l == 0) result = dt_d * ty2_d * ( - ( u1 * u2 ) * tmp2 ) - dt_d * ty1_d * ( - c34 * tmp2 * u1 ); if (l == 1) result = dt_d * ty2_d * ( u2 * tmp1 ) - dt_d * ty1_d * ( c34 * tmp1 ) - dt_d * ty1_d * dy2_d; if (l == 2) result = dt_d * ty2_d * ( u1 * tmp1 ); if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 2) { if (l == 0) result = dt_d * ty2_d * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + fphalf * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt_d * ty1_d * ( - r43 * c34 * tmp2 * u2 ); if (l == 1) result = dt_d * ty2_d * ( - c2 * ( u1 * tmp1 ) ); if (l == 2) result = dt_d * ty2_d * ( ( fptwo - c2 ) * ( u2 * tmp1 ) ) - dt_d * ty1_d * ( r43 * c34 * tmp1 ) - dt_d * ty1_d * dy3_d; if (l == 3) result = dt_d * ty2_d * ( - c2 * ( u3 * tmp1 ) ); if (l == 4) result = dt_d * ty2_d * c2; } if (m == 3) { if (l == 0) result = dt_d * ty2_d * ( - ( u2 * u3 ) * tmp2 ) - dt_d * ty1_d * ( - c34 * tmp2 * u3 ); if (l == 1) result = fpzero; if (l == 2) result = dt_d * ty2_d * ( u3 * tmp1 ); if (l == 3) result = dt_d * ty2_d * ( u2 * tmp1 ) - dt_d * ty1_d * ( c34 * tmp1 ) - dt_d * ty1_d * dy4_d; if (l == 4) result = fpzero; } if (m == 4) { if (l == 0) result = dt_d * ty2_d * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( 
u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt_d * ty1_d * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 ); if (l == 1) result = dt_d * ty2_d * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt_d * ty1_d * ( c34 - c1345 ) * tmp2 * u1; if (l == 2) result = dt_d * ty2_d * ( c1 * ( u4 * tmp1 ) - fphalf * c2 * ( ( u1 * u1 + fpthree * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt_d * ty1_d * ( r43 * c34 - c1345 ) * tmp2 * u2; if (l == 3) result = dt_d * ty2_d * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt_d * ty1_d * ( c34 - c1345 ) * tmp2 * u3; if (l == 4) result = dt_d * ty2_d * ( c1 * ( u2 * tmp1 ) ) - dt_d * ty1_d * c1345 * tmp1 - dt_d * ty1_d * dy5_d; } return result; } cuda/kernels/jacu/jacu_a.cuh0000644000175600017620000000651011440543555014475 0ustar sjpsjp/** * Device function for calculating, just-in-time, the value of a[k][j][i][l][m]. */ template __device__ fp_type jacu_a_value(const fp_type u0, const fp_type u1, const fp_type u2, const fp_type u3, const fp_type u4) { // Some constants. 
const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type r43 = ( fpfour / fpthree ); const fp_type c1345 = c1 * c3 * c4 * c5; const fp_type c34 = c3 * c4; const fp_type tmp1 = fpone / u0; const fp_type tmp2 = tmp1 * tmp1; const fp_type tmp3 = tmp1 * tmp2; fp_type result; if (m == 0) { if (l == 0) result = -dt_d * tx1_d * dx1_d; if (l == 1) result = dt_d * tx2_d; if (l == 2) result = fpzero; if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 1) { if (l == 0) result = dt_d * tx2_d * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * fphalf * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt_d * tx1_d * ( - r43 * c34 * tmp2 * u1 ); if (l == 1) result = dt_d * tx2_d * ( ( fptwo - c2 ) * ( u1 * tmp1 ) ) - dt_d * tx1_d * ( r43 * c34 * tmp1 ) - dt_d * tx1_d * dx2_d; if (l == 2) result = dt_d * tx2_d * ( - c2 * ( u2 * tmp1 ) ); if (l == 3) result = dt_d * tx2_d * ( - c2 * ( u3 * tmp1 ) ); if (l == 4) result = dt_d * tx2_d * c2; } if (m == 2) { if (l == 0) result = dt_d * tx2_d * ( - ( u1 * u2 ) * tmp2 ) - dt_d * tx1_d * ( - c34 * tmp2 * u2 ); if (l == 1) result = dt_d * tx2_d * ( u2 * tmp1 ); if (l == 2) result = dt_d * tx2_d * ( u1 * tmp1 ) -dt_d * tx1_d * ( c34 * tmp1 ) -dt_d * tx1_d * dx3_d; if (l == 3) result = fpzero; if (l == 4) result = fpzero; } if (m == 3) { if (l == 0) result = dt_d * tx2_d * ( - ( u1 * u3 ) * tmp2 ) - dt_d * tx1_d * ( - c34 * tmp2 * u3 ); if (l == 1) result = dt_d * tx2_d * ( u3 * tmp1 ); if (l == 2) result = fpzero; if (l == 3) result = dt_d * tx2_d * ( u1 * tmp1 ) -dt_d * tx1_d * ( c34 * tmp1 ) -dt_d * tx1_d * dx4_d; if (l == 4) result = fpzero; } if (m == 4) { if (l == 0) result = dt_d * tx2_d * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 
) ) * ( u1 * tmp1 ) ) - dt_d * tx1_d * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 ); if (l == 1) result = dt_d * tx2_d * ( c1 * ( u4 * tmp1 ) - fphalf * c2 * ( ( fpthree * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt_d * tx1_d * ( r43 * c34 - c1345 ) * tmp2 * u1; if (l == 2) result = dt_d * tx2_d * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt_d * tx1_d * ( c34 - c1345 ) * tmp2 * u2; if (l == 3) result = dt_d * tx2_d * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt_d * tx1_d * ( c34 - c1345 ) * tmp2 * u3; if (l == 4) result = dt_d * tx2_d * ( c1 * ( u1 * tmp1 ) ) - dt_d * tx1_d * c1345 * tmp1 - dt_d * tx1_d * dx5_d; } return result; } cuda/kernels/rhs/rhs_setup.cuh0000644000175600017620000000153611440727744015150 0ustar sjpsjp// Macro! #define ursdtile ((rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3)) /** * CUDA kernel for updating rsd based on frct. */ __global__ void rhs_setup_kernel(fp_type* rsd, fp_type* frct) { // Calculate i and j values. const int i = (blockIdx.x * rhsblock_x) + threadIdx.x; const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y; const int k = (blockIdx.y / rhsgrid_y); if (i >= 2 && i <= nx_d + 1 && j >= 2 && j <= ny_d + 1 && k >= 0 && k <= nz_d - 1) { const int index = tiled_index(k, j, i, 0); rsd[index + 0 * ursdtile] = - frct[index + 0 * ursdtile]; rsd[index + 1 * ursdtile] = - frct[index + 1 * ursdtile]; rsd[index + 2 * ursdtile] = - frct[index + 2 * ursdtile]; rsd[index + 3 * ursdtile] = - frct[index + 3 * ursdtile]; rsd[index + 4 * ursdtile] = - frct[index + 4 * ursdtile]; } } cuda/kernels/rhs/zeta/rhs_zeta_dissipation.cuh0000644000175600017620000000451211440546017020311 0ustar sjpsjp/** * CUDA kernel for the fourth-order dissipation in the zeta direction. */ __global__ void rhs_zeta_dissipation_kernel(fp_type* u, fp_type* rsd) { /** * Local variables. */ int i, j, k, m; // Constants. 
const fp_type fpfour = 4.0e+00; const fp_type fpfive = 5.0e+00; const fp_type fpsix = 6.0e+00; // Calculate i and j values. i = (blockIdx.x * rhsblock_x) + threadIdx.x; j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y; k = (blockIdx.y / rhsgrid_y); if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp_d * ( + fpfive * u[tiled_index(1, j, i, m)] - fpfour * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp_d * ( - fpfour * u[tiled_index(1, j, i, m)] + fpsix * u[tiled_index(2, j, i, m)] - fpfour * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } // If k is in range, update rsd. if (k >= 3 && k <= nz_d - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp_d * ( u[tiled_index(k-2, j, i, m)] - fpfour * u[tiled_index(k-1, j, i, m)] + fpsix * u[tiled_index(k, j, i, m)] - fpfour * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } if (k == nz_d - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz_d-3, j, i, m)] = rsd[tiled_index(nz_d-3, j, i, m)] - dssp_d * ( u[tiled_index(nz_d-5, j, i, m)] - fpfour * u[tiled_index(nz_d-4, j, i, m)] + fpsix * u[tiled_index(nz_d-3, j, i, m)] - fpfour * u[tiled_index(nz_d-2, j, i, m)] ); } } if (k == nz_d - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz_d-2, j, i, m)] = rsd[tiled_index(nz_d-2, j, i, m)] - dssp_d * ( u[tiled_index(nz_d-4, j, i, m)] - fpfour * u[tiled_index(nz_d-3, j, i, m)] + fpfive * u[tiled_index(nz_d-2, j, i, m)] ); } } } } cuda/kernels/rhs/zeta/rhs_zeta4.cuh0000644000175600017620000000420611440545704015771 0ustar sjpsjp/** * CUDA kernel for the fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __global__ void rhs_zeta4_kernel(fp_type* u, fp_type* rsd, fp_type* flux) { /** * Local variables. */ int i, j, k; // Some constants. const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type fptwo = 2.0e+00; // Calculate i and j values. i = (blockIdx.x * rhsblock_x) + threadIdx.x; j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y; k = (blockIdx.y / rhsgrid_y); if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1_d * tz1_d * ( u[tiled_index(k-1, j, i, 0)] - fptwo * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3_d * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2_d * tz1_d * ( u[tiled_index(k-1, j, i, 1)] - fptwo * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3_d * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3_d * tz1_d * ( u[tiled_index(k-1, j, i, 2)] - fptwo * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3_d * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4_d * tz1_d * ( u[tiled_index(k-1, j, i, 3)] - fptwo * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3_d * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5_d * tz1_d * ( u[tiled_index(k-1, j, i, 4)] - fptwo * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } cuda/kernels/rhs/zeta/rhs_zeta3.cuh0000644000175600017620000000345511440546137015776 0ustar sjpsjp/** * CUDA kernel for the third part of zeta-direction flux differences. * Update flux based on u. 
*/ __global__ void rhs_zeta3_kernel(fp_type* u, fp_type* flux) { /** * Local variables. */ int i, j, k; fp_type tmp; fp_type u21k, u31k, u41k, u51k; fp_type u21km1, u31km1, u41km1, u51km1; // Some constants. const fp_type c1 = c1_def; const fp_type c5 = c5_def; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type fpfive = 5.0e+00; const fp_type fpsix = 6.0e+00; // Calculate i and j values. i = (blockIdx.x * rhsblock_x) + threadIdx.x; j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y; k = (blockIdx.y / rhsgrid_y); if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 1) { tmp = fpone / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = fpone / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3_d * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3_d * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (fpfour/fpthree) * tz3_d * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = fphalf * ( fpone - c1 * c5 ) * tz3_d * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (fpone/fpsix) * tz3_d * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3_d * ( u51k - u51km1 ); } } cuda/kernels/rhs/zeta/rhs_zeta2.cuh0000644000175600017620000000241111440474434015764 0ustar sjpsjp/** * CUDA kernel for the second part of zeta-direction flux differences. * Update rsd based on u. */ __global__ void rhs_zeta2_kernel(fp_type* rsd, fp_type* flux) { /** * Local variables. */ int i, j, k; // Calculate i and j values. 
i = (blockIdx.x * rhsblock_x) + threadIdx.x; j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y; k = (blockIdx.y / rhsgrid_y); if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2_d * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2_d * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2_d * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2_d * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2_d * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } cuda/kernels/rhs/zeta/rhs_zeta1.cuh0000644000175600017620000000251511440545526015771 0ustar sjpsjp/** * CUDA kernel for the first part of zeta-direction flux differences. * Update flux based on u. */ __global__ void rhs_zeta1_kernel(fp_type* u, fp_type* flux) { /** * Local variables. */ int i, j, k; fp_type q, u41; // Some constants. const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type fphalf = 0.50e+00; // Calculate i and j values. i = (blockIdx.x * rhsblock_x) + threadIdx.x; j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y; k = (blockIdx.y / rhsgrid_y); if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 0 && k <= nz_d - 1) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)];

        // u41 is component 3 of u divided by component 0.
        u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)];

        // q is half the sum of squares of components 1..3, divided by component 0.
        q = fphalf * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)]
                     + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)]
                     + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] )
              / u[tiled_index(k, j, i, 0)];

        flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41;
        flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41;
        flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q );
        flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41;
    }
}
cuda/kernels/rhs/eta/rhs_eta_dissipation.cuh0000644000175600017620000000511611440733024017722 0ustar sjpsjp
/**
 * CUDA kernel for the fourth-order dissipation in the eta-direction.
 */
// NOTE(review): the template parameter list appears to have been lost in
// extraction; the body compares `west` and `east` against -1, so they are
// presumably integer template parameters. Restore from the original source.
template
__global__ void rhs_eta_dissipation_kernel(fp_type* u, fp_type* rsd)
{
    /**
     * Local variables.
     */
    int i, j, k, m;
    // Index bounds for the interior stencil; they are compared against the
    // integer index j, so they are integers rather than fp_type.
    int jst1, jend1;

    // Constants
    const fp_type fpfour = 4.0e+00;
    const fp_type fpfive = 5.0e+00;
    const fp_type fpsix = 6.0e+00;

    // Calculate i and j values.
    i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    k = (blockIdx.y / rhsgrid_y);

    // Check that i and k are allowable values (j is tested per stencil below).
    if (i >= ist_d && i <= iend_d && k >= 1 && k <= nz_d - 2) {

        // One-sided stencils for rows j == 3 and j == 4, used only when
        // west == -1.
        if (west == -1) {
            for (m = 0; m < 5; m++) {
                if (j == 3) {
                    rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)]
                        - dssp_d * ( + fpfive * u[tiled_index(k, 3, i, m)]
                                     - fpfour * u[tiled_index(k, 4, i, m)]
                                     +          u[tiled_index(k, 5, i, m)] );
                }
                if (j == 4) {
                    rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)]
                        - dssp_d * ( - fpfour * u[tiled_index(k, 3, i, m)]
                                     + fpsix  * u[tiled_index(k, 4, i, m)]
                                     - fpfour * u[tiled_index(k, 5, i, m)]
                                     +          u[tiled_index(k, 6, i, m)] );
                }
            }
        }

        // Update jst1 and jend1 based on east and west.
if (west != -1) { jst1 = 2; } else { jst1 = 5; }
        if (east != -1) { jend1 = ny_d + 1; } else { jend1 = ny_d - 2; }

        // Full five-point stencil for rows inside [jst1, jend1].
        if (j >= jst1 && j <= jend1) {
            for (m = 0; m < 5; m++) {
                rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)]
                    - dssp_d * (            u[tiled_index(k, j-2, i, m)]
                                 - fpfour * u[tiled_index(k, j-1, i, m)]
                                 + fpsix  * u[tiled_index(k, j, i, m)]
                                 - fpfour * u[tiled_index(k, j+1, i, m)]
                                 +          u[tiled_index(k, j+2, i, m)] );
            }
        }

        // One-sided stencils for the last two rows, used only when east == -1.
        if (east == -1) {
            if (j == ny_d - 1) {
                for (m = 0; m < 5; m++) {
                    rsd[tiled_index(k, ny_d-1, i, m)] = rsd[tiled_index(k, ny_d-1, i, m)]
                        - dssp_d * (            u[tiled_index(k, ny_d-3, i, m)]
                                     - fpfour * u[tiled_index(k, ny_d-2, i, m)]
                                     + fpsix  * u[tiled_index(k, ny_d-1, i, m)]
                                     - fpfour * u[tiled_index(k, ny_d, i, m)] );
                }
            }
            if (j == ny_d) {
                for (m = 0; m < 5; m++) {
                    rsd[tiled_index(k, ny_d, i, m)] = rsd[tiled_index(k, ny_d, i, m)]
                        - dssp_d * (            u[tiled_index(k, ny_d-2, i, m)]
                                     - fpfour * u[tiled_index(k, ny_d-1, i, m)]
                                     + fpfive * u[tiled_index(k, ny_d, i, m)] );
                }
            }
        }
    }
}
cuda/kernels/rhs/eta/rhs_eta4.cuh0000644000175600017620000000451211440732724015405 0ustar sjpsjp
// Macro!
#define ursdtile ((rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3))

/**
 * CUDA kernel for the fourth part of eta-direction flux differences.
 * Update rsd based on u and flux.
 */
__global__ void rhs_eta4_kernel(fp_type* u, fp_type* rsd, fp_type* flux)
{
    // Some constants.
    const fp_type c3 = c3_def;
    const fp_type c4 = c4_def;
    const fp_type fptwo = 2.0e+00;

    // Calculate i and j values.
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Check that i and j are allowable values.
    if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) {

        // Calculate the indices.
const int center = tiled_index(k, j, i, 0);
        const int jplus  = tiled_index(k, j + 1, i, 0);
        const int jminus = tiled_index(k, j - 1, i, 0);

        // Offsets from component 0 to components 1..4.
        const int off1 = 1 * ursdtile;
        const int off2 = 2 * ursdtile;
        const int off3 = 3 * ursdtile;
        const int off4 = 4 * ursdtile;

        // Component 0: second difference of u only (no flux term).
        rsd[center] = rsd[center]
            + dy1_d * ty1_d * ( u[jminus] - fptwo * u[center] + u[jplus] );

        // Components 1..4: forward flux difference plus second difference of u.
        rsd[center + off1] = rsd[center + off1]
            + ty3_d * c3 * c4 * ( flux[jplus + off1] - flux[center + off1] )
            + dy2_d * ty1_d * ( u[jminus + off1] - fptwo * u[center + off1] + u[jplus + off1] );

        rsd[center + off2] = rsd[center + off2]
            + ty3_d * c3 * c4 * ( flux[jplus + off2] - flux[center + off2] )
            + dy3_d * ty1_d * ( u[jminus + off2] - fptwo * u[center + off2] + u[jplus + off2] );

        rsd[center + off3] = rsd[center + off3]
            + ty3_d * c3 * c4 * ( flux[jplus + off3] - flux[center + off3] )
            + dy4_d * ty1_d * ( u[jminus + off3] - fptwo * u[center + off3] + u[jplus + off3] );

        rsd[center + off4] = rsd[center + off4]
            + ty3_d * c3 * c4 * ( flux[jplus + off4] - flux[center + off4] )
            + dy5_d * ty1_d * ( u[jminus + off4] - fptwo * u[center + off4] + u[jplus + off4] );
    }
}
cuda/kernels/rhs/eta/rhs_eta3.cuh0000644000175600017620000000423611440732356015410 0ustar sjpsjp
// Macro!
#define ursdtile ((rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3))

/**
 * CUDA kernel for the third part of eta-direction flux differences.
 * Update flux based on u.
 */
// NOTE(review): the template parameter list appears to have been lost in
// extraction; the body compares `east` against -1, so it is presumably an
// integer template parameter. Restore from the original source.
template
__global__ void rhs_eta3_kernel(fp_type* u, fp_type* flux)
{
    /**
     * Local variables.
     */
    int L2;

    // Some constants.
    const fp_type c1 = c1_def;
    const fp_type c5 = c5_def;
    const fp_type fphalf = 0.50e+00;
    const fp_type fpone = 1.0e+00;
    const fp_type fpthree = 3.0e+00;
    const fp_type fpfour = 4.0e+00;
    const fp_type fpsix = 6.0e+00;

    // Calculate i and j values.
const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Upper j bound depends on east.
    if (east != -1) { L2 = ny_d + 2; } else { L2 = ny_d + 1; }

    // Check that i, j and k are allowable values.
    if (i >= ist_d && i <= iend_d && j >= jst_d && j <= L2 && k >= 1 && k <= nz_d - 2) {

        // Component-0 indices of this point and of its j-1 neighbour.
        const int cur  = tiled_index(k, j, i, 0);
        const int prev = tiled_index(k, j-1, i, 0);

        const fp_type inv_u0      = fpone / u[cur  + 0 * ursdtile];
        const fp_type inv_u0_prev = fpone / u[prev + 0 * ursdtile];

        const fp_type u21j   = inv_u0      * u[cur  + 1 * ursdtile];
        const fp_type u21jm1 = inv_u0_prev * u[prev + 1 * ursdtile];
        flux[cur + 1 * ursdtile] = ty3_d * ( u21j - u21jm1 );

        const fp_type u31j   = inv_u0      * u[cur  + 2 * ursdtile];
        const fp_type u31jm1 = inv_u0_prev * u[prev + 2 * ursdtile];
        flux[cur + 2 * ursdtile] = (fpfour/fpthree) * ty3_d * (u31j - u31jm1);

        const fp_type u41j   = inv_u0      * u[cur  + 3 * ursdtile];
        const fp_type u41jm1 = inv_u0_prev * u[prev + 3 * ursdtile];
        flux[cur + 3 * ursdtile] = ty3_d * ( u41j - u41jm1 );

        const fp_type u51j   = inv_u0      * u[cur  + 4 * ursdtile];
        const fp_type u51jm1 = inv_u0_prev * u[prev + 4 * ursdtile];
        flux[cur + 4 * ursdtile] = fphalf * ( fpone - c1 * c5 ) * ty3_d
            * ( ( u21j * u21j + u31j * u31j + u41j * u41j )
              - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) )
            + (fpone/fpsix) * ty3_d * ( u31j * u31j - u31jm1 * u31jm1 )
            + c1 * c5 * ty3_d * ( u51j - u51jm1 );
    }
}
cuda/kernels/rhs/eta/rhs_eta2.cuh0000644000175600017620000000245011440731632015379 0ustar sjpsjp
// Macro!
#define ursdtile ((rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3))

/**
 * CUDA kernel for the second part of eta-direction flux differences.
 * Update rsd based on flux.
 */
__global__ void rhs_eta2_kernel(fp_type* rsd, fp_type* flux)
{
    // Calculate i and j values.
const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Check that i, j and k are allowable values.
    if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) {

        // Component-0 indices of this point and its j+1 / j-1 neighbours.
        const int center = tiled_index(k, j, i, 0);
        const int jplus  = tiled_index(k, j+1, i, 0);
        const int jminus = tiled_index(k, j-1, i, 0);

        // Centered flux difference for each of the five components.
        for (int m = 0; m < 5; m++) {
            const int off = m * ursdtile;
            rsd[center + off] -= ty2_d * ( flux[jplus + off] - flux[jminus + off] );
        }
    }
}
cuda/kernels/rhs/eta/rhs_eta1.cuh0000644000175600017620000000305511440733127015401 0ustar sjpsjp
// Macro!
#define ursdtile ((rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3))

/**
 * CUDA kernel for the first part of eta-direction flux differences.
 * Update flux based on u.
 */
// NOTE(review): the template parameter list appears to have been lost in
// extraction; the body compares `west` and `east` against -1, so they are
// presumably integer template parameters. Restore from the original source.
template
__global__ void rhs_eta1_kernel(fp_type* u, fp_type* flux)
{
    /**
     * Local variables.
     */
    fp_type q, u31;
    int L1, L2;

    // Some constants.
    const fp_type c1 = c1_def;
    const fp_type c2 = c2_def;
    const fp_type fphalf = 0.50e+00;

    // Calculate i and j values.
    const int i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    const int j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    const int k = (blockIdx.y / rhsgrid_y);

    // Set L1.
    if (west != -1) { L1 = 1; } else { L1 = 2; }

    // Set L2.
    if (east != -1) { L2 = ny_d + 2; } else { L2 = ny_d + 1; }

    if (i >= ist_d && i <= iend_d && j >= L1 && j <= L2 && k >= 1 && k <= nz_d - 2) {

        // Read in the u values.
        const int index = tiled_index(k, j, i, 0);

        // Update flux.
const fp_type u2 = u[index + 2 * ursdtile];
        flux[index + 0 * ursdtile] = u2;

        const fp_type u0 = u[index + 0 * ursdtile];
        u31 = u2 / u0;

        const fp_type u1 = u[index + 1 * ursdtile];
        const fp_type u3 = u[index + 3 * ursdtile];
        // q is half the sum of squares of components 1..3, divided by component 0.
        q = fphalf * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0;

        flux[index + 1 * ursdtile] = u1 * u31;
        flux[index + 3 * ursdtile] = u3 * u31;

        const fp_type u4 = u[index + 4 * ursdtile];
        flux[index + 2 * ursdtile] = u2 * u31 + c2 * ( u4 - q );
        flux[index + 4 * ursdtile] = ( c1 * u4 - c2 * q ) * u31;
    }
}
cuda/kernels/rhs/xi/rhs_xi_dissipation.cuh0000644000175600017620000000463111440545473017452 0ustar sjpsjp
/**
 * CUDA kernel for the fourth-order dissipation step in xi-direction.
 */
// NOTE(review): the template parameter list appears to have been lost in
// extraction; the body compares `north` and `south` against -1, so they are
// presumably integer template parameters. Restore from the original source.
template
__global__ void rhs_xi_dissipation_kernel(fp_type* u, fp_type* rsd)
{
    /**
     * Local variables.
     */
    int i, j, k, m;
    // Index bounds for the interior stencil; they are compared against the
    // integer index i, so they are integers rather than fp_type.
    int ist1, iend1;

    // Constants.
    const fp_type fpfour = 4.0e+00;
    const fp_type fpfive = 5.0e+00;
    const fp_type fpsix = 6.0e+00;

    // Calculate i and j values.
    i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    k = (blockIdx.y / rhsgrid_y);

    // Check that j and k are allowable values.
    if (j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d -2) {

        // One-sided stencils for columns i == 3 and i == 4, used only when
        // north == -1.
        if (north == -1) {
            for (m = 0; m < 5; m++) {
                if (i == 3) {
                    rsd[tiled_index(k, j, 3, m)] -= dssp_d * ( + fpfive * u[tiled_index(k, j, 3, m)]
                                                               - fpfour * u[tiled_index(k, j, 4, m)]
                                                               +          u[tiled_index(k, j, 5, m)] );
                }
                if (i == 4) {
                    rsd[tiled_index(k, j, 4, m)] -= dssp_d * ( - fpfour * u[tiled_index(k, j, 3, m)]
                                                               + fpsix  * u[tiled_index(k, j, 4, m)]
                                                               - fpfour * u[tiled_index(k, j, 5, m)]
                                                               +          u[tiled_index(k, j, 6, m)] );
                }
            }
        }

        // Update ist1 and iend1 based on north and south.
        if (north != -1) { ist1 = 2; }
        if (south != -1) { iend1 = nx_d + 1; }
        if (north == -1) { ist1 = 5; }
        if (south == -1) { iend1 = nx_d - 2; }

        // If i is in range, update rsd.
if (i >= ist1 && i <= iend1) {
            // Full five-point stencil along i.
            for (m = 0; m < 5; m++) {
                rsd[tiled_index(k, j, i, m)] -= dssp_d * (            u[tiled_index(k, j, i-2, m)]
                                                           - fpfour * u[tiled_index(k, j, i-1, m)]
                                                           + fpsix  * u[tiled_index(k, j, i, m)]
                                                           - fpfour * u[tiled_index(k, j, i+1, m)]
                                                           +          u[tiled_index(k, j, i+2, m)] );
            }
        }

        // One-sided stencils for the last two columns, used only when south == -1.
        if (south == -1) {
            if (i == nx_d - 1) {
                for (m = 0; m < 5; m++) {
                    rsd[tiled_index(k, j, nx_d-1, m)] -= dssp_d * (            u[tiled_index(k, j, nx_d-3, m)]
                                                                    - fpfour * u[tiled_index(k, j, nx_d-2, m)]
                                                                    + fpsix  * u[tiled_index(k, j, nx_d-1, m)]
                                                                    - fpfour * u[tiled_index(k, j, nx_d, m)] );
                }
            }
            if (i == nx_d) {
                for (m = 0; m < 5; m++) {
                    rsd[tiled_index(k, j, nx_d, m)] -= dssp_d * (            u[tiled_index(k, j, nx_d-2, m)]
                                                                  - fpfour * u[tiled_index(k, j, nx_d-1, m)]
                                                                  + fpfive * u[tiled_index(k, j, nx_d, m)] );
                }
            }
        }
    }
}
cuda/kernels/rhs/xi/rhs_xi4.cuh0000644000175600017620000000410111440545402015110 0ustar sjpsjp
/**
 * CUDA kernel for the fourth part of xi-direction flux differences.
 * Update rsd based on u and flux.
 */
__global__ void rhs_xi4_kernel(fp_type* u, fp_type* rsd, fp_type* flux)
{
    /**
     * Local variables.
     */
    int i, j, k;

    // Some constants.
    const fp_type c3 = c3_def;
    const fp_type c4 = c4_def;
    const fp_type fptwo = 2.0e+00;

    // Calculate i and j values.
    i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    k = (blockIdx.y / rhsgrid_y);

    // Check that j and k are allowable values.
if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) {

        // Component 0: second difference of u only (no flux term).
        rsd[tiled_index(k, j, i, 0)] += dx1_d * tx1_d
            * (         u[tiled_index(k, j, i-1, 0)]
              - fptwo * u[tiled_index(k, j, i, 0)]
              +         u[tiled_index(k, j, i+1, 0)] );

        // Components 1..4: forward flux difference plus second difference of u.
        rsd[tiled_index(k, j, i, 1)] +=
              tx3_d * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] )
            + dx2_d * tx1_d * (         u[tiled_index(k, j, i-1, 1)]
                              - fptwo * u[tiled_index(k, j, i, 1)]
                              +         u[tiled_index(k, j, i+1, 1)] );

        rsd[tiled_index(k, j, i, 2)] +=
              tx3_d * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] )
            + dx3_d * tx1_d * (         u[tiled_index(k, j, i-1, 2)]
                              - fptwo * u[tiled_index(k, j, i, 2)]
                              +         u[tiled_index(k, j, i+1, 2)] );

        rsd[tiled_index(k, j, i, 3)] +=
              tx3_d * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] )
            + dx4_d * tx1_d * (         u[tiled_index(k, j, i-1, 3)]
                              - fptwo * u[tiled_index(k, j, i, 3)]
                              +         u[tiled_index(k, j, i+1, 3)] );

        rsd[tiled_index(k, j, i, 4)] +=
              tx3_d * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] )
            + dx5_d * tx1_d * (         u[tiled_index(k, j, i-1, 4)]
                              - fptwo * u[tiled_index(k, j, i, 4)]
                              +         u[tiled_index(k, j, i+1, 4)] );
    }
}
cuda/kernels/rhs/xi/rhs_xi3.cuh0000644000175600017620000000371111440546203015115 0ustar sjpsjp
/**
 * CUDA kernel for the third part of xi-direction flux differences.
 * Update flux (again) based on u.
 */
// NOTE(review): the template parameter list appears to have been lost in
// extraction; the body compares `south` against -1, so it is presumably an
// integer template parameter. Restore from the original source.
template
__global__ void rhs_xi3_kernel(fp_type* u, fp_type* flux)
{
    /**
     * Local variables.
     */
    int i, j, k;
    int L2;
    fp_type u21i, u31i, u41i, u51i;
    fp_type u21im1, u31im1, u41im1, u51im1;
    fp_type tmp;

    // Some constants.
    const fp_type c1 = c1_def;
    const fp_type c5 = c5_def;
    const fp_type fphalf = 0.50e+00;
    const fp_type fpone = 1.0e+00;
    const fp_type fpthree = 3.0e+00;
    const fp_type fpfour = 4.0e+00;
    const fp_type fpsix = 6.0e+00;

    // Calculate i and j values.
i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    k = (blockIdx.y / rhsgrid_y);

    // Upper i bound depends on south.
    if (south != -1) { L2 = nx_d + 2; } else { L2 = nx_d + 1; }

    // Check that i, j and k are allowable values.
    if (i >= ist_d && i <= L2 && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) {

        // Components 1..4 of u scaled by 1 / component 0, at column i.
        tmp = fpone / u[tiled_index(k, j, i, 0)];
        u21i = tmp * u[tiled_index(k, j, i, 1)];
        u31i = tmp * u[tiled_index(k, j, i, 2)];
        u41i = tmp * u[tiled_index(k, j, i, 3)];
        u51i = tmp * u[tiled_index(k, j, i, 4)];

        // The same quantities at column i-1.
        tmp = fpone / u[tiled_index(k, j, i-1, 0)];
        u21im1 = tmp * u[tiled_index(k, j, i-1, 1)];
        u31im1 = tmp * u[tiled_index(k, j, i-1, 2)];
        u41im1 = tmp * u[tiled_index(k, j, i-1, 3)];
        u51im1 = tmp * u[tiled_index(k, j, i-1, 4)];

        flux[tiled_index(k, j, i, 1)] = ( fpfour / fpthree ) * tx3_d * (u21i - u21im1);
        flux[tiled_index(k, j, i, 2)] = tx3_d * ( u31i - u31im1 );
        flux[tiled_index(k, j, i, 3)] = tx3_d * ( u41i - u41im1 );
        flux[tiled_index(k, j, i, 4)] = fphalf * ( fpone - c1 * c5 ) * tx3_d
            * ( ( u21i * u21i + u31i * u31i + u41i * u41i )
              - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) )
            + (fpone/fpsix) * tx3_d * ( u21i * u21i - u21im1 * u21im1 )
            + c1 * c5 * tx3_d * ( u51i - u51im1 );
    }
}
cuda/kernels/rhs/xi/rhs_xi2.cuh0000644000175600017620000000212411440474332015114 0ustar sjpsjp
/**
 * CUDA kernel for the second part of xi-direction flux differences.
 * Update rsd based on flux.
 */
__global__ void rhs_xi2_kernel(fp_type* rsd, fp_type* flux)
{
    /**
     * Local variables.
     */
    int i, j, k;

    // Calculate i and j values.
    i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    k = (blockIdx.y / rhsgrid_y);

    // Check that j and k are allowable values.
if (i >= ist_d && i <= iend_d && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) {
        // Centered flux difference between columns i+1 and i-1, applied to
        // each of the five components.
        for (int m = 0; m < 5; m++) {
            rsd[tiled_index(k, j, i, m)] -= tx2_d
                * ( flux[tiled_index(k, j, i+1, m)] - flux[tiled_index(k, j, i-1, m)] );
        }
    }
}
cuda/kernels/rhs/xi/rhs_xi1.cuh0000644000175600017620000000303611440545223015114 0ustar sjpsjp
/**
 * CUDA kernel for the first part of xi-direction flux differences.
 * Update flux based on u.
 */
// NOTE(review): the template parameter list appears to have been lost in
// extraction; the body compares `north` and `south` against -1, so they are
// presumably integer template parameters. Restore from the original source.
template
__global__ void rhs_xi1_kernel(fp_type* u, fp_type* flux)
{
    /**
     * Local variables.
     */
    int i, j, k;
    fp_type q, u21;
    int L1, L2;

    // Some constants.
    const fp_type c1 = c1_def;
    const fp_type c2 = c2_def;
    const fp_type fphalf = 0.50e+00;

    // Calculate i and j values.
    i = (blockIdx.x * rhsblock_x) + threadIdx.x;
    j = ((blockIdx.y % rhsgrid_y) * rhsblock_y) + threadIdx.y;
    k = (blockIdx.y / rhsgrid_y);

    // Set L1.
    if (north != -1) { L1 = 1; } else { L1 = 2; }

    // Set L2.
    if (south != -1) { L2 = nx_d + 2; } else { L2 = nx_d + 1; }

    if (i >= L1 && i <= L2 && j >= jst_d && j <= jend_d && k >= 1 && k <= nz_d - 2) {
        // Update flux.
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = fphalf * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; } } headers/util.h0000644000175600017620000000054511477715041012015 0ustar sjpsjp#ifndef __UTIL_H__ #define __UTIL_H__ #include "applu.h" #include "timers.h" #define min(x,y) (x < y ? x : y) #define max(x,y) (x > y ? x : y) extern void swap_pointers(fp_type** x, fp_type** y); extern void parse_options(int argc, char* argv[]); extern void print_timers(); extern void allocate_buffers(); extern void free_buffers(); #endif headers/types.h0000644000175600017620000000050311437672221012174 0ustar sjpsjp/** * Kind of allows us to emulate the and templates * from C++. */ #ifndef __TYPES_H__ #define __TYPES_H__ #ifdef SINGLE_PRECISION_LU typedef float fp_type; #define MPI_FP_TYPE MPI_FLOAT #endif #ifdef DOUBLE_PRECISION_LU typedef double fp_type; #define MPI_FP_TYPE MPI_DOUBLE #endif #endif headers/timers.h0000644000175600017620000000133011477420037012332 0ustar sjpsjp#ifndef __TIMERS_H__ #define __TIMERS_H__ /** * Timer struct. 
*/ typedef struct { double cpu_start; double cpu_end; double wall_start; double wall_end; double cpu_total; double wall_total; int calls; } Timer; extern void timer_init(int number); extern void timer_reset(); extern void timer_start(int i); extern void timer_stop(int i); extern void timer_pause(int i); extern void timer_restart(int i); extern void timer_finalize(); extern double timer_cpu_total(int i); extern double timer_cpu_avg(int i); extern double timer_wall_total(int i); extern double timer_wall_avg(int i); extern void timer_print_format(); extern void timer_print(const char* name, int i); #endif headers/mpinpb.h0000666000175600017620000000032211420014630012302 0ustar sjpsjp /** * Shared header file for anything using MPI. */ #include extern int node, no_nodes, root, comm_setup, comm_solve, comm_rhs; // Removed dp_type, since we know we are using MPI_DOUBLE everywhere. headers/globals.h0000644000175600017620000000220311477673371012465 0ustar sjpsjp/** * Definition of global variables defined by applu.h * * Note: This file should ONLY be included by main.c. * "#include applu.h" is sufficient to access these variables. 
*/ int verified; fp_type mflops; char class; int nx, ny, nz; int nx0, ny0, nz0; int ipt, ist, iend; int jpt, jst, jend; int ii1, ii2; int ji1, ji2; int ki1, ki2; fp_type dxi, deta, dzeta; fp_type tx1, tx2, tx3; fp_type ty1, ty2, ty3; fp_type tz1, tz2, tz3; fp_type dx1, dx2, dx3, dx4, dx5; fp_type dy1, dy2, dy3, dy4, dy5; fp_type dz1, dz2, dz3, dz4, dz5; fp_type dssp; fp_type *u_flat, *rsd_flat, *frct_flat, *flux_flat; fp_type ****u, ****rsd, ****frct, ****flux; int ipr, inorm; int itmax, invert; fp_type dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc, ttotal; fp_type *a_flat, *b_flat, *c_flat, *d_flat; fp_type a, b, c, d; fp_type ce[13][5]; int id, ndim, num, xdim, ydim, row, col; int north, south, east, west; int icommn[npmax+1], icomms[npmax+1], icomme[npmax+1], icommw[npmax+1]; fp_type *buf_flat, *buf1_flat; fp_type **buf, **buf1; fp_type *jbuf_flat, *ibuf_flat; fp_type ***jbuf, ***ibuf; fp_type maxtime; // Some MPI stuff. int root; headers/functions.h0000644000175600017620000000341611440475375013053 0ustar sjpsjp#ifndef __APPLU_FUNCTIONS__ #define __APPLU_FUNCTIONS__ extern void bcast_inputs(); extern void blts ( int ldmx, int ldmy, int ldmz, int nx, int ny, int nz, int starting_k, fp_type omega, fp_type**** v, fp_type ldz, fp_type ldy, fp_type ldx, fp_type d, int ist, int iend, int jst, int jend, int nx0, int ny0, int ipt, int jpt); extern void buts ( int ldmx, int ldmy, int ldmz, int nx, int ny, int nz, int starting_k, fp_type omega, fp_type**** v, fp_type*** tv, fp_type d, fp_type udx, fp_type udy, fp_type udz, int ist, int iend, int jst, int jend, int nx0, int ny0, int ipt, int jpt ); extern void erhs(); extern void error(); extern void exact(int i, int j, int k, fp_type u000ijk[5]); extern void exchange_1(fp_type**** g, int k, int iex); extern void exchange_3(fp_type**** g, int iex); extern void exchange_4(fp_type** g, fp_type** h, int ibeg, int ifin1, int jbeg, int jfin1); extern void exchange_5(fp_type** g, int ibeg, int ifin1); extern void 
exchange_6(fp_type** g, int jbeg, int jfin1); extern void init_comm(int argc, char** argv); extern void jacld(int starting_k); extern void jacu(int starting_k); extern void l2norm (int ldx, int ldy, int ldz, int nx0, int ny0, int nz0, int ist, int iend, int jst, int jend, fp_type**** v, fp_type sum[5]); extern void neighbors(); extern int nodedim(int num); extern void pintgr(); extern void proc_grid(); extern void read_input(); extern void rhs(); extern void setbv(); extern void setcoeff(); extern void sethyper(); extern void setiv(); extern void ssor(int niter); extern void subdomain(); extern int verify(fp_type xcr[5], fp_type xce[5], fp_type xci, char class_string); #endif headers/applu_cuda.h0000644000175600017620000000234111457037577013162 0ustar sjpsjp/** * Header for useful CUDA stuff. */ #ifndef __APPLU_CUDA_H__ #define __APPLU_CUDA_H__ #include "types.h" typedef struct { // Solution arrays. fp_type* u; fp_type* rsd; fp_type* frct; fp_type* flux; // Array for l2norm reduction. fp_type* sum; // Rearrangement buffer. fp_type* rearrangement; // Lookup tables. int* columns; int* rows; int* wave2d_offsets; int* wave3d_offsets; int* thread_map; // Ex1 buffers. fp_type* ibuffer; fp_type* jbuffer; // Ex3 buffers. fp_type* buf; fp_type* buf1; } luBuffers; // Prototype functions. 
extern void allocate_buffers(luBuffers* buffers); extern void free_buffers(luBuffers* buffers); extern void prepare_lookup_tables(int* wave2d_offsets, int* wave3d_offsets, int* columns, int* rows, int* thread_map, luBuffers* buffers); extern void print_gpu_info(); extern void flat_to_tiled(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers); extern void tiled_to_flat(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers); extern void tiled_to_hyperplane(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers); extern void hyperplane_to_tiled(fp_type* buffer, fp_type* rearrangement, luBuffers* buffers); #endif headers/applu.h0000644000175600017620000000447111477726362012173 0ustar sjpsjp// C port of NPB3.2 // applu.h // size.h defines problem and decomposition sizes. #include "size.h" #include "types.h" /** * isiz01,02,03 give maximum size. * ipr = 1 to print out verbose information. * omega = 2.0 is correct for all classes. * tolrsd is tolerance levels for steady state residuals. */ #define ipr_default 1 #define omega_default (1.20e+00) #define tolrsd1_def (1.0e-08) #define tolrsd2_def (1.0e-08) #define tolrsd3_def (1.0e-08) #define tolrsd4_def (1.0e-08) #define tolrsd5_def (1.0e-08) #define c1_def (1.40e+00) #define c2_def (0.40e+00) #define c3_def (1.00e-01) #define c4_def (1.00e+00) #define c5_def (1.40e+00) /** * Grid. */ extern int nx, ny, nz; extern int nx0, ny0, nz0; extern int ipt, ist, iend; extern int jpt, jst, jend; extern int ii1, ii2; extern int ji1, ji2; extern int ki1, ki2; extern fp_type dxi, deta, dzeta; extern fp_type tx1, tx2, tx3; extern fp_type ty1, ty2, ty3; extern fp_type tz1, tz2, tz3; /** * Dissipation. */ extern fp_type dx1, dx2, dx3, dx4, dx5; extern fp_type dy1, dy2, dy3, dy4, dy5; extern fp_type dz1, dz2, dz3, dz4, dz5; extern fp_type dssp; /** * Field variables and residuals. 
*/ extern fp_type* u_flat; extern fp_type* rsd_flat; extern fp_type* frct_flat; extern fp_type* flux_flat; extern fp_type**** u; extern fp_type**** rsd; extern fp_type**** frct; extern fp_type**** flux; /** * Output control parameters. */ extern int ipr, inorm; /** * Newton-raphson iteration control parameters. */ extern int itmax, invert; extern fp_type dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc, ttotal; extern fp_type* a_flat; extern fp_type* b_flat; extern fp_type* c_flat; extern fp_type* d_flat; extern fp_type a; extern fp_type b; extern fp_type c; extern fp_type d; /** * Coefficients of the exact solution. */ extern fp_type ce[13][5]; /** * Multi-processor common blocks. */ extern int id, ndim, num, xdim, ydim, row, col; extern int north, south, east, west; #define from_s 1 #define from_n 2 #define from_e 3 #define from_w 4 #define npmax (isiz01 + isiz02) extern int icommn[npmax+1], icomms[npmax+1], icomme[npmax+1], icommw[npmax+1]; extern fp_type *buf_flat, *buf1_flat; extern fp_type **buf, **buf1; extern fp_type *jbuf_flat, *ibuf_flat; extern fp_type ***jbuf, ***ibuf; extern fp_type maxtime; // Function prototypes. #include "functions.h" headers/alloc.h0000644000175600017620000000065611440472621012127 0ustar sjpsjpextern fp_type**** alloc_ursd(fp_type* space); extern void free_ursd(fp_type**** pointers); extern fp_type alloc_abcd(fp_type* space); extern void free_abcd(fp_type pointers); extern fp_type** alloc_buffer(fp_type* space); extern void free_buffer(fp_type **pointers); fp_type*** alloc_ibuf(fp_type* space); void free_ibuf(fp_type*** pointers); fp_type*** alloc_jbuf(fp_type* space); void free_jbuf(fp_type*** pointers); alloc.c0000644000175600017620000000744211445702447010516 0ustar sjpsjp#include #include #include "applu.h" /** * Set up pointers for a 4D ursd array using the provided contiguous space. */ fp_type**** alloc_ursd(fp_type* space) { int k, j, i; fp_type **** pointers; // Now allocate all of our pointer space. 
pointers = malloc(isiz3 * sizeof(fp_type***)); for (k = 0; k < isiz3; k++) { pointers[k] = malloc( (isiz2 + 4) * sizeof(fp_type**) ); for (j = 0; j < (isiz2 + 4); j++) { pointers[k][j] = malloc( (isiz1 + 4) * sizeof(fp_type*) ); for (i = 0; i < (isiz1 + 4); i++) { // Calculate i, j, k index. int index = (k * (isiz2 + 4) * (isiz1 + 4) * 5) + (j * (isiz1 + 4) * 5) + (i * 5); pointers[k][j][i] = space + index; } } } return pointers; } /** * Free a memory for a ursd array. */ void free_ursd(fp_type**** pointers) { int k, j, i; for (k = 0; k < isiz3; k++) { for (j = 0; j < (isiz2 + 4); j++) { free(pointers[k][j]); } free(pointers[k]); } free(pointers); } /** * Set up pointers for a 5D abcd array using the provided contiguous space. */ fp_type alloc_abcd(fp_type* space) { int k, j, i, m; fp_type pointers; // Now allocate all of our pointer space. pointers = malloc(kblock * sizeof(fp_type****)); for (k = 0; k < kblock; k++) { pointers[k] = malloc( (isiz2 + 4) * sizeof(fp_type***) ); for (j = 0; j < (isiz2 + 4); j++) { pointers[k][j] = malloc( (isiz1 + 4) * sizeof(fp_type**) ); for (i = 0; i < (isiz1 + 4); i++) { pointers[k][j][i] = malloc ( 5 * sizeof(fp_type*) ); for (m = 0; m < 5; m++) { // Calculate i, j, k, m index. int index = (k * (isiz2 + 4) * (isiz1 + 4) * 5 * 5) + (j * (isiz1 + 4) * 5 * 5) + (i * 5 * 5) + (m * 5); pointers[k][j][i][m] = space + index; } } } } return pointers; } void free_abcd(fp_type pointers) { int k, j, i, m; for (k = 0; k < kblock; k++) { for (j = 0; j < (isiz2 + 4); j++) { for (i = 0; i < (isiz1 + 4); i++) { free(pointers[k][j][i]); } free(pointers[k][j]); } free(pointers[k]); } free(pointers); } /** * Set up 2D pointers for our buffer. */ fp_type** alloc_buffer(fp_type* space) { int k; fp_type** pointers; // Now allocate all of our pointer space. pointers = malloc( (isiz3 * isiz2 * 2) * sizeof(fp_type*) ); for (k = 0; k < (isiz3 * isiz2 * 2); k++) { // Calculate k, m, index. 
int index = k * 5; pointers[k] = space + index; } return pointers; } void free_buffer(fp_type** pointers) { free(pointers); } /** * Set up 2D pointers for our ibuf. */ fp_type*** alloc_ibuf(fp_type* space) { int i, k, m; fp_type*** pointers; // Now allocate all of our pointer space. pointers = malloc( kblock * sizeof(fp_type**) ); for (k = 0; k < kblock; k++) { pointers[k] = malloc( (iend - ist + 1) * sizeof(fp_type*) ); for (i = 0; i < (iend - ist + 1); i++) { int index = (k * (iend - ist + 1) * 5) + (i * 5); pointers[k][i] = space + index; } } return pointers; } void free_ibuf(fp_type*** pointers) { int k; for (k = 0; k < kblock; k++) { free(pointers[k]); } free(pointers); } /** * Set up 2D pointers for our jbuf. */ fp_type*** alloc_jbuf(fp_type* space) { int j, k, m; fp_type*** pointers; // Now allocate all of our pointer space. pointers = malloc( kblock * sizeof(fp_type**) ); for (k = 0; k < kblock; k++) { pointers[k] = malloc( (jend - jst + 1) * sizeof(fp_type*) ); for (j = 0; j < (jend - jst + 1); j++) { int index = (k * (jend - jst + 1) * 5) + (j * 5); pointers[k][j] = space + index; } } return pointers; } void free_jbuf(fp_type*** pointers) { int k; for (k = 0; k < kblock; k++) { free(pointers[k]); } free(pointers); } bcast_inputs.c0000666000175600017620000000170311440467645012123 0ustar sjpsjp// C port of NPB3.2 // subroutine bcast_inputs #include "applu.h" #include "mpinpb.h" void bcast_inputs() { root = 0; /** * Root broadcasts the data. * The data isn't contiguous or of the same type, so it's not * clear how to send it in the "MPI" way. * We could pack the info into a buffer or we could create * an obscene datatype to handle it all at once. Since we only * broadcast the data once, just use a separate broadcast for * each piece. 
*/ MPI_Bcast(&ipr, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&inorm, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&itmax, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&dt, 1, MPI_FP_TYPE, root, MPI_COMM_WORLD); MPI_Bcast(&omega, 1, MPI_FP_TYPE, root, MPI_COMM_WORLD); MPI_Bcast(&tolrsd, 5, MPI_FP_TYPE, root, MPI_COMM_WORLD); MPI_Bcast(&nx0, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&ny0, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&nz0, 1, MPI_INT, root, MPI_COMM_WORLD); } blts.c0000646000175600017620000001401411440535071010353 0ustar sjpsjp// C port of NPB3.2 // subroutine blts #include "applu.h" /** * Compute the regular-sparse, block lower triangular solution. * v <-- ( L-inv ) * v */ void blts ( const int ldmx, const int ldmy, const int ldmz, const int nx, const int ny, const int nz, const int starting_k, fp_type omega, fp_type**** v, fp_type ldz, fp_type ldy, fp_type ldx, fp_type d, const int ist, const int iend, const int jst, const int jend, const int nx0, const int ny0, const int ipt, const int jpt) { /** * Local variables. */ int i, j, k, m; int iex; fp_type tmp, tmp1; fp_type tmat[5][5]; // Constants. const fp_type fpone = 1.0e+00; // Receive data from north and west. 
iex = 0; exchange_1(v, starting_k, iex); for (k = starting_k; k < starting_k + kblock; k++) { int level = k % kblock; for (j = 0; j < ny + 4; j++) { for (i = 0; i < nx + 4; i++) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { for (m = 0; m < 5; m++) { v[k][j][i][m] = v[k][j][i][m] - omega * ( ldz[level][j][i][0][m] * v[k-1][j][i][0] + ldz[level][j][i][1][m] * v[k-1][j][i][1] + ldz[level][j][i][2][m] * v[k-1][j][i][2] + ldz[level][j][i][3][m] * v[k-1][j][i][3] + ldz[level][j][i][4][m] * v[k-1][j][i][4] ); } } } } for (j = 0; j < ny + 4; j++) { for (i = 0; i < nx + 4; i++) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { for (m = 0; m < 5; m++) { v[k][j][i][m] = v[k][j][i][m] - omega * ( ldy[level][j][i][0][m] * v[k][j-1][i][0] + ldx[level][j][i][0][m] * v[k][j][i-1][0] + ldy[level][j][i][1][m] * v[k][j-1][i][1] + ldx[level][j][i][1][m] * v[k][j][i-1][1] + ldy[level][j][i][2][m] * v[k][j-1][i][2] + ldx[level][j][i][2][m] * v[k][j][i-1][2] + ldy[level][j][i][3][m] * v[k][j-1][i][3] + ldx[level][j][i][3][m] * v[k][j][i-1][3] + ldy[level][j][i][4][m] * v[k][j-1][i][4] + ldx[level][j][i][4][m] * v[k][j][i-1][4] ); } /** * Diagonal block inversion. * Forward elimination. */ for (m = 0; m < 5; m++) { tmat[0][m] = d[level][j][i][0][m]; tmat[1][m] = d[level][j][i][1][m]; tmat[2][m] = d[level][j][i][2][m]; tmat[3][m] = d[level][j][i][3][m]; tmat[4][m] = d[level][j][i][4][m]; } // ip = 0. 
tmp1 = fpone / tmat[0][0]; tmp = tmp1 * tmat[0][1]; tmat[1][1] = tmat[1][1] - tmp * tmat[1][0]; tmat[2][1] = tmat[2][1] - tmp * tmat[2][0]; tmat[3][1] = tmat[3][1] - tmp * tmat[3][0]; tmat[4][1] = tmat[4][1] - tmp * tmat[4][0]; v[k][j][i][1] = v[k][j][i][1] - v[k][j][i][0] * tmp; tmp = tmp1 * tmat[0][2]; tmat[1][2] = tmat[1][2] - tmp * tmat[1][0]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][0]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][0]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][0]; v[k][j][i][2] = v[k][j][i][2] - v[k][j][i][0] * tmp; tmp = tmp1 * tmat[0][3]; tmat[1][3] = tmat[1][3] - tmp * tmat[1][0]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][0]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][0]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][0]; v[k][j][i][3] = v[k][j][i][3] - v[k][j][i][0] * tmp; tmp = tmp1 * tmat[0][4]; tmat[1][4] = tmat[1][4] - tmp * tmat[1][0]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][0]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][0]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][0]; v[k][j][i][4] = v[k][j][i][4] - v[k][j][i][0] * tmp; // ip = 1. 
tmp1 = fpone / tmat[1][1]; tmp = tmp1 * tmat[1][2]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][1]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][1]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][1]; v[k][j][i][2] = v[k][j][i][2] - v[k][j][i][1] * tmp; tmp = tmp1 * tmat[1][3]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][1]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][1]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][1]; v[k][j][i][3] = v[k][j][i][3] - v[k][j][i][1] * tmp; tmp = tmp1 * tmat[1][4]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][1]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][1]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][1]; v[k][j][i][4] = v[k][j][i][4] - v[k][j][i][1] * tmp; // ip = 2 tmp1 = fpone / tmat[2][2]; tmp = tmp1 * tmat[2][3]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][2]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][2]; v[k][j][i][3] = v[k][j][i][3] - v[k][j][i][2] * tmp; tmp = tmp1 * tmat[2][4]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][2]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][2]; v[k][j][i][4] = v[k][j][i][4] - v[k][j][i][2] * tmp; // ip = 3 tmp1 = fpone / tmat[3][3]; tmp = tmp1 * tmat[3][4]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][3]; v[k][j][i][4] = v[k][j][i][4] - v[k][j][i][3] * tmp; /** * Back substitution. */ v[k][j][i][4] = v[k][j][i][4] / tmat[4][4]; v[k][j][i][3] = v[k][j][i][3] - tmat[4][3] * v[k][j][i][4]; v[k][j][i][3] = v[k][j][i][3] / tmat[3][3]; v[k][j][i][2] = v[k][j][i][2] - tmat[3][2] * v[k][j][i][3] - tmat[4][2] * v[k][j][i][4]; v[k][j][i][2] = v[k][j][i][2] / tmat[2][2]; v[k][j][i][1] = v[k][j][i][1] - tmat[2][1] * v[k][j][i][2] - tmat[3][1] * v[k][j][i][3] - tmat[4][1] * v[k][j][i][4]; v[k][j][i][1] = v[k][j][i][1] / tmat[1][1]; v[k][j][i][0] = v[k][j][i][0] - tmat[1][0] * v[k][j][i][1] - tmat[2][0] * v[k][j][i][2] - tmat[3][0] * v[k][j][i][3] - tmat[4][0] * v[k][j][i][4]; v[k][j][i][0] = v[k][j][i][0] / tmat[0][0]; } } } } // Send data to south and east. 
iex = 2; exchange_1(v, starting_k + kblock - 1, iex); } buts.c0000646000175600017620000001414011440535052010363 0ustar sjpsjp// C port of NPB3.2 // subroutine buts #include "applu.h" /** * Compute the regular-sparse, block upper triangular solution. * v <-- ( U-inv ) * v */ void buts ( const int ldmx, const int ldmy, const int ldmz, const int nx, const int ny, const int nz, const int starting_k, fp_type omega, fp_type**** v, fp_type*** tv, fp_type d, fp_type udx, fp_type udy, fp_type udz, const int ist, const int iend, const int jst, const int jend, const int nx0, const int ny0, const int ipt, const int jpt ) { /** * Local variables. */ int i, j, k, m; int iex; fp_type tmp, tmp1; fp_type tmat[5][5]; // Constants. const fp_type fpone = 1.0e+00; // Receive data from south and east. iex = 1; exchange_1(v, starting_k, iex); for (k = starting_k; k > starting_k - kblock; k--) { int level = k % kblock; for (j = ny + 3; j >= 0; j--) { for (i = nx + 3; i >= 0; i--) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { for (m = 0; m < 5; m++) { tv[j][i][m] = omega * ( udz[level][j][i][0][m] * v[k+1][j][i][0] + udz[level][j][i][1][m] * v[k+1][j][i][1] + udz[level][j][i][2][m] * v[k+1][j][i][2] + udz[level][j][i][3][m] * v[k+1][j][i][3] + udz[level][j][i][4][m] * v[k+1][j][i][4] ); } } } } for (j = ny + 3; j >= 0; j--) { for (i = nx + 3; i >= 0; i--) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { for (m = 0; m < 5; m++) { tv[j][i][m] = tv[j][i][m] + omega * ( udy[level][j][i][0][m] * v[k][j+1][i][0] + udx[level][j][i][0][m] * v[k][j][i+1][0] + udy[level][j][i][1][m] * v[k][j+1][i][1] + udx[level][j][i][1][m] * v[k][j][i+1][1] + udy[level][j][i][2][m] * v[k][j+1][i][2] + udx[level][j][i][2][m] * v[k][j][i+1][2] + udy[level][j][i][3][m] * v[k][j+1][i][3] + udx[level][j][i][3][m] * v[k][j][i+1][3] + udy[level][j][i][4][m] * v[k][j+1][i][4] + udx[level][j][i][4][m] * v[k][j][i+1][4] ); } /** * Diagonal block inversion. 
*/ for (m = 0; m < 5; m++) { tmat[0][m] = d[level][j][i][0][m]; tmat[1][m] = d[level][j][i][1][m]; tmat[2][m] = d[level][j][i][2][m]; tmat[3][m] = d[level][j][i][3][m]; tmat[4][m] = d[level][j][i][4][m]; } // ip = 0. tmp1 = fpone / tmat[0][0]; tmp = tmp1 * tmat[0][1]; tmat[1][1] = tmat[1][1] - tmp * tmat[1][0]; tmat[2][1] = tmat[2][1] - tmp * tmat[2][0]; tmat[3][1] = tmat[3][1] - tmp * tmat[3][0]; tmat[4][1] = tmat[4][1] - tmp * tmat[4][0]; tv[j][i][1] = tv[j][i][1] - tv[j][i][0] * tmp; tmp = tmp1 * tmat[0][2]; tmat[1][2] = tmat[1][2] - tmp * tmat[1][0]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][0]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][0]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][0]; tv[j][i][2] = tv[j][i][2] - tv[j][i][0] * tmp; tmp = tmp1 * tmat[0][3]; tmat[1][3] = tmat[1][3] - tmp * tmat[1][0]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][0]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][0]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][0]; tv[j][i][3] = tv[j][i][3] - tv[j][i][0] * tmp; tmp = tmp1 * tmat[0][4]; tmat[1][4] = tmat[1][4] - tmp * tmat[1][0]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][0]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][0]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][0]; tv[j][i][4] = tv[j][i][4] - tv[j][i][0] * tmp; // ip = 1. tmp1 = fpone / tmat[1][1]; tmp = tmp1 * tmat[1][2]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][1]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][1]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][1]; tv[j][i][2] = tv[j][i][2] - tv[j][i][1] * tmp; tmp = tmp1 * tmat[1][3]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][1]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][1]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][1]; tv[j][i][3] = tv[j][i][3] - tv[j][i][1] * tmp; tmp = tmp1 * tmat[1][4]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][1]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][1]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][1]; tv[j][i][4] = tv[j][i][4] - tv[j][i][1] * tmp; // ip = 2. 
tmp1 = fpone / tmat[2][2]; tmp = tmp1 * tmat[2][3]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][2]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][2]; tv[j][i][3] = tv[j][i][3] - tv[j][i][2] * tmp; tmp = tmp1 * tmat[2][4]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][2]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][2]; tv[j][i][4] = tv[j][i][4] - tv[j][i][2] * tmp; // ip = 3. tmp = fpone / tmat[3][3]; tmp = tmp1 * tmat[3][4]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][3]; tv[j][i][4] = tv[j][i][4] - tv[j][i][3] * tmp; /** * Back substitution. */ tv[j][i][4] = tv[j][i][4] / tmat[4][4]; tv[j][i][3] = tv[j][i][3] - tmat[4][3] * tv[j][i][4]; tv[j][i][3] = tv[j][i][3] / tmat[3][3]; tv[j][i][2] = tv[j][i][2] - tmat[3][2] * tv[j][i][3] - tmat[4][2] * tv[j][i][4]; tv[j][i][2] = tv[j][i][2] / tmat[2][2]; tv[j][i][1] = tv[j][i][1] - tmat[2][1] * tv[j][i][2] - tmat[3][1] * tv[j][i][3] - tmat[4][1] * tv[j][i][4]; tv[j][i][1] = tv[j][i][1] / tmat[1][1]; tv[j][i][0] = tv[j][i][0] - tmat[1][0] * tv[j][i][1] - tmat[2][0] * tv[j][i][2] - tmat[3][0] * tv[j][i][3] - tmat[4][0] * tv[j][i][4]; tv[j][i][0] = tv[j][i][0] / tmat[0][0]; v[k][j][i][0] = v[k][j][i][0] - tv[j][i][0]; v[k][j][i][1] = v[k][j][i][1] - tv[j][i][1]; v[k][j][i][2] = v[k][j][i][2] - tv[j][i][2]; v[k][j][i][3] = v[k][j][i][3] - tv[j][i][3]; v[k][j][i][4] = v[k][j][i][4] - tv[j][i][4]; } } } } // Send data to north and west. iex = 3; exchange_1(v, starting_k - kblock + 1, iex); } erhs.c0000646000175600017620000003743411544625762010377 0ustar sjpsjp// C port of NPB3.2 // subroutine erhs #include "applu.h" /** * Compute the right hand side based on exact solution. */ void erhs() { /** * Local variables. 
*/ int i, j, k, m; int iglob, jglob; int iex; int L1, L2; int ist1, iend1; int jst1, jend1; fp_type dsspm; fp_type xi, eta, zeta; fp_type q; fp_type u21, u31, u41; fp_type tmp; fp_type u21i, u31i, u41i, u51i; fp_type u21j, u31j, u41j, u51j; fp_type u21k, u31k, u41k, u51k; fp_type u21im1, u31im1, u41im1, u51im1; fp_type u21jm1, u31jm1, u41jm1, u51jm1; fp_type u21km1, u31km1, u41km1, u51km1; dsspm = dssp; // Constants. const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type fpfive = 5.0e+00; const fp_type fpsix = 6.0e+00; const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { for (i = 2; i <= nx + 1; i++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = fpzero; } } } } for (k = 0; k <= nz - 1; k++) { zeta = ( (fp_type) (k) ) / (nz - 1); for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; eta = ( (fp_type) (jglob - 2) ) / (ny0 - 1); for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; xi = ( (fp_type) (iglob - 2) ) / (nx0 - 1); for (m = 0; m < 5; m++) { rsd[k][j][i][m] = ce[0][m] + ce[1][m] * xi + ce[2][m] * eta + ce[3][m] * zeta + ce[4][m] * xi * xi + ce[5][m] * eta * eta + ce[6][m] * zeta * zeta + ce[7][m] * xi * xi * xi + ce[8][m] * eta * eta * eta + ce[9][m] * zeta * zeta * zeta + ce[10][m] * xi * xi * xi * xi + ce[11][m] * eta * eta * eta * eta + ce[12][m] * zeta * zeta * zeta * zeta; } } } } /** * xi-direction flux differences. * * iex = flag : iex = 0 north/south communication. * : iex = 1 east/west communication. */ iex = 0; // Communicate and receive/send two rows of data. 
exchange_3 (rsd, iex); L1 = 1; if (north == -1) { L1 = 2; } L2 = nx + 2; if (south == -1) { L2 = nx + 1; } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { for (i = L1; i <= L2; i++) { flux[k][j][i][0] = rsd[k][j][i][1]; u21 = rsd[k][j][i][1] / rsd[k][j][i][0]; q = fphalf * ( rsd[k][j][i][1] * rsd[k][j][i][1] + rsd[k][j][i][2] * rsd[k][j][i][2] + rsd[k][j][i][3] * rsd[k][j][i][3] ) / rsd[k][j][i][0]; flux[k][j][i][1] = rsd[k][j][i][1] * u21 + c2 * ( rsd[k][j][i][4] - q ); flux[k][j][i][2] = rsd[k][j][i][2] * u21; flux[k][j][i][3] = rsd[k][j][i][3] * u21; flux[k][j][i][4] = ( c1 * rsd[k][j][i][4] - c2 * q ) * u21; } } } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = frct[k][j][i][m] - tx2 * ( flux[k][j][i+1][m] - flux[k][j][i-1][m] ); } } for (i = ist; i <= L2; i++) { tmp = fpone / rsd[k][j][i][0]; u21i = tmp * rsd[k][j][i][1]; u31i = tmp * rsd[k][j][i][2]; u41i = tmp * rsd[k][j][i][3]; u51i = tmp * rsd[k][j][i][4]; tmp = fpone / rsd[k][j][i-1][0]; u21im1 = tmp * rsd[k][j][i-1][1]; u31im1 = tmp * rsd[k][j][i-1][2]; u41im1 = tmp * rsd[k][j][i-1][3]; u51im1 = tmp * rsd[k][j][i-1][4]; flux[k][j][i][1] = (fpfour/fpthree) * tx3 * ( u21i - u21im1 ); flux[k][j][i][2] = tx3 * ( u31i - u31im1 ); flux[k][j][i][3] = tx3 * ( u41i - u41im1 ); flux[k][j][i][4] = fphalf * ( fpone - c1*c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (fpone/fpsix) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } for (i = ist; i <= iend; i++) { frct[k][j][i][0] = frct[k][j][i][0] + dx1 * tx1 * ( rsd[k][j][i-1][0] - fptwo * rsd[k][j][i][0] + rsd[k][j][i+1][0] ); frct[k][j][i][1] = frct[k][j][i][1] + tx3 * c3 * c4 * ( flux[k][j][i+1][1] - flux[k][j][i][1] ) + dx2 * tx1 * ( rsd[k][j][i-1][1] - fptwo * rsd[k][j][i][1] + rsd[k][j][i+1][1] ); frct[k][j][i][2] = frct[k][j][i][2] + tx3 * c3 * c4 * 
( flux[k][j][i+1][2] - flux[k][j][i][2] ) + dx3 * tx1 * ( rsd[k][j][i-1][2] - fptwo * rsd[k][j][i][2] + rsd[k][j][i+1][2] ); frct[k][j][i][3] = frct[k][j][i][3] + tx3 * c3 * c4 * ( flux[k][j][i+1][3] - flux[k][j][i][3] ) + dx4 * tx1 * ( rsd[k][j][i-1][3] - fptwo * rsd[k][j][i][3] + rsd[k][j][i+1][3] ); frct[k][j][i][4] = frct[k][j][i][4] + tx3 * c3 * c4 * ( flux[k][j][i+1][4] - flux[k][j][i][4] ) + dx5 * tx1 * ( rsd[k][j][i-1][4] - fptwo * rsd[k][j][i][4] + rsd[k][j][i+1][4] ); } /** * Fourth-order dissipation. */ if (north == -1) { for (m = 0; m < 5; m++) { frct[k][j][3][m] = frct[k][j][3][m] - dsspm * ( + fpfive * rsd[k][j][3][m] - fpfour * rsd[k][j][4][m] + rsd[k][j][5][m] ); frct[k][j][4][m] = frct[k][j][4][m] - dsspm * ( - fpfour * rsd[k][j][3][m] + fpsix * rsd[k][j][4][m] - fpfour * rsd[k][j][5][m] + rsd[k][j][6][m] ); } } ist1 = 2; iend1 = nx + 1; if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } for (i = ist1; i <= iend1; i++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = frct[k][j][i][m] - dsspm * ( rsd[k][j][i-2][m] - fpfour * rsd[k][j][i-1][m] + fpsix * rsd[k][j][i][m] - fpfour * rsd[k][j][i+1][m] + rsd[k][j][i+2][m] ); } } if (south == -1) { for (m = 0; m < 5; m++) { frct[k][j][nx-1][m] = frct[k][j][nx-1][m] - dsspm * ( rsd[k][j][nx-3][m] - fpfour * rsd[k][j][nx-2][m] + fpsix * rsd[k][j][nx-1][m] - fpfour * rsd[k][j][nx][m] ); frct[k][j][nx][m] = frct[k][j][nx][m] - dsspm * ( rsd[k][j][nx-2][m] - fpfour * rsd[k][j][nx-1][m] + fpfive * rsd[k][j][nx][m]); } } } } /** * eta-direction flux differences. * * iex = flag : iex = 0 north/south communication. * : iex = 1 east/west communication. */ iex = 1; // Communicate and receive/send two rows of data. 
exchange_3 (rsd, iex); L1 = 1; if (west == -1) { L1 = 2; } L2 = ny + 2; if (east == -1) { L2 = ny + 1; } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = L1; j <= L2; j++) { flux[k][j][i][0] = rsd[k][j][i][2]; u31 = rsd[k][j][i][2] / rsd[k][j][i][0]; q = fphalf * ( rsd[k][j][i][1] * rsd[k][j][i][1] + rsd[k][j][i][2] * rsd[k][j][i][2] + rsd[k][j][i][3] * rsd[k][j][i][3] ) / rsd[k][j][i][0]; flux[k][j][i][1] = rsd[k][j][i][1] * u31; flux[k][j][i][2] = rsd[k][j][i][2] * u31 + c2 * (rsd[k][j][i][4] - q); flux[k][j][i][3] = rsd[k][j][i][3] * u31; flux[k][j][i][4] = ( c1 * rsd[k][j][i][4] - c2 * q ) * u31; } } } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = jst; j <= jend; j++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = frct[k][j][i][m] - ty2 * ( flux[k][j+1][i][m] - flux[k][j-1][i][m] ); } } for (j = jst; j <= L2; j++) { tmp = fpone / rsd[k][j][i][0]; u21j = tmp * rsd[k][j][i][1]; u31j = tmp * rsd[k][j][i][2]; u41j = tmp * rsd[k][j][i][3]; u51j = tmp * rsd[k][j][i][4]; tmp = fpone / rsd[k][j-1][i][0]; u21jm1 = tmp * rsd[k][j-1][i][1]; u31jm1 = tmp * rsd[k][j-1][i][2]; u41jm1 = tmp * rsd[k][j-1][i][3]; u51jm1 = tmp * rsd[k][j-1][i][4]; flux[k][j][i][1] = ty3 * ( u21j - u21jm1 ); flux[k][j][i][2] = (fpfour/fpthree) * ty3 * ( u31j - u31jm1 ); flux[k][j][i][3] = ty3 * ( u41j - u41jm1 ); flux[k][j][i][4] = fphalf * ( fpone - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (fpone/fpsix) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } for (j = jst; j <= jend; j++) { frct[k][j][i][0] = frct[k][j][i][0] + dy1 * ty1 * ( rsd[k][j-1][i][0] - fptwo * rsd[k][j][i][0] + rsd[k][j+1][i][0] ); frct[k][j][i][1] = frct[k][j][i][1] + ty3 * c3 * c4 * ( flux[k][j+1][i][1] - flux[k][j][i][1] ) + dy2 * ty1 * ( rsd[k][j-1][i][1] - fptwo * rsd[k][j][i][1] + rsd[k][j+1][i][1] ); frct[k][j][i][2] = frct[k][j][i][2] + ty3 * c3 * c4 * ( 
flux[k][j+1][i][2] - flux[k][j][i][2] ) + dy3 * ty1 * ( rsd[k][j-1][i][2] - fptwo * rsd[k][j][i][2] + rsd[k][j+1][i][2] ); frct[k][j][i][3] = frct[k][j][i][3] + ty3 * c3 * c4 * ( flux[k][j+1][i][3] - flux[k][j][i][3] ) + dy4 * ty1 * ( rsd[k][j-1][i][3] - fptwo * rsd[k][j][i][3] + rsd[k][j+1][i][3] ); frct[k][j][i][4] = frct[k][j][i][4] + ty3 * c3 * c4 * ( flux[k][j+1][i][4] - flux[k][j][i][4] ) + dy5 * ty1 * ( rsd[k][j-1][i][4] - fptwo * rsd[k][j][i][4] + rsd[k][j+1][i][4] ); } /** * Fourth-order dissipation. */ if (west == -1) { for (m = 0; m < 5; m++) { frct[k][3][i][m] = frct[k][3][i][m] - dsspm * ( + fpfive * rsd[k][3][i][m] - fpfour * rsd[k][4][i][m] + rsd[k][5][i][m] ); frct[k][4][i][m] = frct[k][4][i][m] - dsspm * ( - fpfour * rsd[k][3][i][m] + fpsix * rsd[k][4][i][m] - fpfour * rsd[k][5][i][m] + rsd[k][6][i][m] ); } } jst1 = 2; jend1 = ny + 1; if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } for (j = jst1; j <= jend1; j++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = frct[k][j][i][m] - dsspm * ( rsd[k][j-2][i][m] - fpfour * rsd[k][j-1][i][m] + fpsix * rsd[k][j][i][m] - fpfour * rsd[k][j+1][i][m] + rsd[k][j+2][i][m] ); } } if (east == -1) { for (m = 0; m < 5; m++) { frct[k][ny-1][i][m] = frct[k][ny-1][i][m] -dsspm * ( rsd[k][ny-3][i][m] - fpfour * rsd[k][ny-2][i][m] + fpsix * rsd[k][ny-1][i][m] - fpfour * rsd[k][ny][i][m] ); frct[k][ny][i][m] = frct[k][ny][i][m] -dsspm * ( rsd[k][ny-2][i][m] - fpfour * rsd[k][ny-1][i][m] + fpfive * rsd[k][ny][i][m] ); } } } } /** * zeta-direction flux differences. 
*/ for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (k = 0; k <= nz - 1; k++) { flux[k][j][i][0] = rsd[k][j][i][3]; u41 = rsd[k][j][i][3] / rsd[k][j][i][0]; q = fphalf * ( rsd[k][j][i][1] * rsd[k][j][i][1] + rsd[k][j][i][2] * rsd[k][j][i][2] + rsd[k][j][i][3] * rsd[k][j][i][3] ) / rsd[k][j][i][0]; flux[k][j][i][1] = rsd[k][j][i][1] * u41; flux[k][j][i][2] = rsd[k][j][i][2] * u41; flux[k][j][i][3] = rsd[k][j][i][3] * u41 + c2 * ( rsd[k][j][i][4] - q ); flux[k][j][i][4] = ( c1 * rsd[k][j][i][4] - c2 * q ) * u41; } for (k = 1; k <= nz - 2; k++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = frct[k][j][i][m] - tz2 * ( flux[k+1][j][i][m] - flux[k-1][j][i][m] ); } } for (k = 1; k <= nz - 1; k++) { tmp = fpone / rsd[k][j][i][0]; u21k = tmp * rsd[k][j][i][1]; u31k = tmp * rsd[k][j][i][2]; u41k = tmp * rsd[k][j][i][3]; u51k = tmp * rsd[k][j][i][4]; tmp = fpone / rsd[k-1][j][i][0]; u21km1 = tmp * rsd[k-1][j][i][1]; u31km1 = tmp * rsd[k-1][j][i][2]; u41km1 = tmp * rsd[k-1][j][i][3]; u51km1 = tmp * rsd[k-1][j][i][4]; flux[k][j][i][1] = tz3 * ( u21k - u21km1 ); flux[k][j][i][2] = tz3 * ( u31k - u31km1 ); flux[k][j][i][3] = (fpfour/fpthree) * tz3 * ( u41k - u41km1 ); flux[k][j][i][4] = fphalf * ( fpone - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (fpone/fpsix) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } for (k = 1; k <= nz - 2; k++) { frct[k][j][i][0] = frct[k][j][i][0] + dz1 * tz1 * ( rsd[k+1][j][i][0] - fptwo * rsd[k][j][i][0] + rsd[k-1][j][i][0] ); frct[k][j][i][1] = frct[k][j][i][1] + tz3 * c3 * c4 * ( flux[k+1][j][i][1] - flux[k][j][i][1] ) + dz2 * tz1 * ( rsd[k+1][j][i][1] - fptwo * rsd[k][j][i][1] + rsd[k-1][j][i][1] ); frct[k][j][i][2] = frct[k][j][i][2] + tz3 * c3 * c4 * ( flux[k+1][j][i][2] - flux[k][j][i][2] ) + dz3 * tz1 * ( rsd[k+1][j][i][2] - fptwo * rsd[k][j][i][2] + rsd[k-1][j][i][2] ); frct[k][j][i][3] = frct[k][j][i][3] + 
tz3 * c3 * c4 * ( flux[k+1][j][i][3] - flux[k][j][i][3] ) + dz4 * tz1 * ( rsd[k+1][j][i][3] - fptwo * rsd[k][j][i][3] + rsd[k-1][j][i][3] ); frct[k][j][i][4] = frct[k][j][i][4] + tz3 * c3 * c4 * ( flux[k+1][j][i][4] - flux[k][j][i][4] ) + dz5 * tz1 * ( rsd[k+1][j][i][4] - fptwo * rsd[k][j][i][4] + rsd[k-1][j][i][4] ); } /** * Fourth-order dissipation. */ for (m = 0; m < 5; m++) { frct[1][j][i][m] = frct[1][j][i][m] - dsspm * ( + fpfive * rsd[1][j][i][m] - fpfour * rsd[2][j][i][m] + rsd[3][j][i][m] ); frct[2][j][i][m] = frct[2][j][i][m] - dsspm * ( - fpfour * rsd[1][j][i][m] + fpsix * rsd[2][j][i][m] - fpfour * rsd[3][j][i][m] + rsd[4][j][i][m] ); } for (k = 3; k <= nz - 4; k++) { for (m = 0; m < 5; m++) { frct[k][j][i][m] = frct[k][j][i][m] - dsspm * ( rsd[k-2][j][i][m] - fpfour * rsd[k-1][j][i][m] + fpsix * rsd[k][j][i][m] - fpfour * rsd[k+1][j][i][m] + rsd[k+2][j][i][m] ); } } for (m = 0; m < 5; m++) { frct[nz-3][j][i][m] = frct[nz-3][j][i][m] - dsspm * ( rsd[nz-5][j][i][m] - fpfour * rsd[nz-4][j][i][m] + fpsix * rsd[nz-3][j][i][m] - fpfour * rsd[nz-2][j][i][m] ); frct[nz-2][j][i][m] = frct[nz-2][j][i][m] - dsspm * ( rsd[nz-4][j][i][m] - fpfour * rsd[nz-3][j][i][m] + fpfive * rsd[nz-2][j][i][m] ); } } } } error.c0000666000175600017620000000245011440540707010545 0ustar sjpsjp// C port of NPB3.2 // subroutine error #include "applu.h" #include "mpinpb.h" #include /** * Compute the solution error. */ void error() { /** * Local variables. */ int i, j, k, m; int iglob, jglob; fp_type tmp; fp_type u000ijk[5], dummy[5]; const fp_type fpzero = 0.0e+00; for (m = 0; m < 5; m++) { errnm[m] = fpzero; dummy[m] = fpzero; } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { jglob = jpt + j; for (i = ist; i <= iend; i++) { iglob = ipt + i; exact(iglob, jglob, k, u000ijk); for (m = 0; m < 5; m++) { tmp = ( u000ijk[m] - u[k][j][i][m] ); dummy[m] = dummy[m] + (tmp * tmp); } } } } /** * Compute the global sum of individual contributions to dot product. 
*/ MPI_Allreduce(dummy, errnm, 5, MPI_FP_TYPE, MPI_SUM, MPI_COMM_WORLD); for (m = 0; m < 5; m++) { errnm[m] = sqrt ( errnm[m] / ( (nx0-2)*(ny0-2)*(nz0-2) ) ); } /* if (id != 0) { printf("RMS-norm of error in soln. to first pde = %d.\n", errnm[0]); printf("RMS-norm of error in soln. to second pde = %d.\n", errnm[1]); printf("RMS-norm of error in soln. to third pde = %d.\n", errnm[2]); printf("RMS-norm of error in soln. to fourth pde = %d.\n", errnm[3]); printf("RMS-norm of error in soln. to fifth pde = %d.\n", errnm[4]); }*/ } exact.c0000666000175600017620000000203211440470102010503 0ustar sjpsjp// C port of NPB3.2 // subroutine exact( i, j, k, u000ijk ) #include "applu.h" /** * Compute the exact solution at (i, j, k); */ void exact(int i, int j, int k, fp_type u000ijk[5]) { /** * Local variables. */ int m; fp_type xi, eta, zeta; // Note: Originally i-1, j-1, k-1; this may need fixing later. /*xi = ((fp_type) (i - 1)) / (nx0 - 1); eta = ((fp_type) (j - 1)) / (ny0 - 1); zeta = ((fp_type) (k - 1)) / (nz - 1);*/ xi = ((fp_type) (i - 2)) / (nx0 - 1); eta = ((fp_type) (j - 2)) / (ny0 - 1); zeta = ((fp_type) k) / (nz - 1); for (m = 0; m < 5; m++) { u000ijk[m] = ce[0][m] + ce[1][m] * xi + ce[2][m] * eta + ce[3][m] * zeta + ce[4][m] * xi * xi + ce[5][m] * eta * eta + ce[6][m] * zeta * zeta + ce[7][m] * xi * xi * xi + ce[8][m] * eta * eta * eta + ce[9][m] * zeta * zeta * zeta + ce[10][m] * xi * xi * xi * xi + ce[11][m] * eta * eta * eta * eta + ce[12][m] * zeta * zeta * zeta * zeta; //printf("For i = %d, j = %d, k = %d, exact[%d] = %e.\n", i, j, k, m, u000ijk[m]); } } exchange_1.c0000666000175600017620000001276511440470146011427 0ustar sjpsjp// C port of NPB3.2 // subroutine exchange_1(g, k, iex) #include "mpinpb.h" #include "applu.h" #include #include /** * iex = 0 : Receive north/west. * iex = 1 : Receive south/east. * iex = 2 : Send south/east. * iex = 3 : Send north/west. 
*/ void exchange_1(fp_type**** g, int k, int iex){ int i, j, z; MPI_Status status; /** * Receive north/west. */ if (iex == 0) { // Receive from north. if (north != -1) { MPI_Recv(jbuf_flat, kblock*(jend-jst+1)*5, MPI_FP_TYPE, north, from_n, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst; j <= jend; j++) { g[k + z][j][1][0] = jbuf[z][j - jst][0]; g[k + z][j][1][1] = jbuf[z][j - jst][1]; g[k + z][j][1][2] = jbuf[z][j - jst][2]; g[k + z][j][1][3] = jbuf[z][j - jst][3]; g[k + z][j][1][4] = jbuf[z][j - jst][4]; } } } } // Receive from west. if (west != -1) { MPI_Recv(ibuf_flat, kblock*(iend-ist+1)*5, MPI_FP_TYPE, west, from_w, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist; i <= iend; i++) { g[k + z][1][i][0] = ibuf[z][i - ist][0]; g[k + z][1][i][1] = ibuf[z][i - ist][1]; g[k + z][1][i][2] = ibuf[z][i - ist][2]; g[k + z][1][i][3] = ibuf[z][i - ist][3]; g[k + z][1][i][4] = ibuf[z][i - ist][4]; } } } } /** * Receive south/east. */ } else if (iex == 1) { // Receive from south. if (south != -1) { MPI_Recv(jbuf_flat, kblock*(jend-jst+1)*5, MPI_FP_TYPE, south, from_s, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst; j <= jend; j++) { g[k + z - (kblock - 1)][j][nx+2][0] = jbuf[z][j - jst][0]; g[k + z - (kblock - 1)][j][nx+2][1] = jbuf[z][j - jst][1]; g[k + z - (kblock - 1)][j][nx+2][2] = jbuf[z][j - jst][2]; g[k + z - (kblock - 1)][j][nx+2][3] = jbuf[z][j - jst][3]; g[k + z - (kblock - 1)][j][nx+2][4] = jbuf[z][j - jst][4]; } } } } // Receive from east. 
if (east != -1) { MPI_Recv(ibuf_flat, kblock*(iend-ist+1)*5, MPI_FP_TYPE, east, from_e, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist; i <= iend; i++) { g[k + z - (kblock - 1)][ny+2][i][0] = ibuf[z][i - ist][0]; g[k + z - (kblock - 1)][ny+2][i][1] = ibuf[z][i - ist][1]; g[k + z - (kblock - 1)][ny+2][i][2] = ibuf[z][i - ist][2]; g[k + z - (kblock - 1)][ny+2][i][3] = ibuf[z][i - ist][3]; g[k + z - (kblock - 1)][ny+2][i][4] = ibuf[z][i - ist][4]; } } } } /** * Send south/east. */ } else if (iex == 2) { // Send south. if (south != -1) { for (z = 0; z < kblock; z++) { if (k + z - (kblock-1) >= 1 && k + z - (kblock-1) <= nz - 2) { for (j = jst; j <= jend; j++) { jbuf[z][j - jst][0] = g[k + z - (kblock-1)][j][nx+1][0]; jbuf[z][j - jst][1] = g[k + z - (kblock-1)][j][nx+1][1]; jbuf[z][j - jst][2] = g[k + z - (kblock-1)][j][nx+1][2]; jbuf[z][j - jst][3] = g[k + z - (kblock-1)][j][nx+1][3]; jbuf[z][j - jst][4] = g[k + z - (kblock-1)][j][nx+1][4]; } } } // Send south. MPI_Send(jbuf_flat, kblock*(jend-jst+1)*5, MPI_FP_TYPE, south, from_n, MPI_COMM_WORLD); } // Send east. if (east != -1) { for (z = 0; z < kblock; z++) { if (k + z - (kblock-1) >= 1 && k + z - (kblock-1) <= nz - 2) { for (i = ist; i <= iend; i++) { ibuf[z][i - ist][0] = g[k + z - (kblock-1)][ny+1][i][0]; ibuf[z][i - ist][1] = g[k + z - (kblock-1)][ny+1][i][1]; ibuf[z][i - ist][2] = g[k + z - (kblock-1)][ny+1][i][2]; ibuf[z][i - ist][3] = g[k + z - (kblock-1)][ny+1][i][3]; ibuf[z][i - ist][4] = g[k + z - (kblock-1)][ny+1][i][4]; } } } MPI_Send(ibuf_flat, kblock*(iend-ist+1)*5, MPI_FP_TYPE, east, from_w, MPI_COMM_WORLD); } /** * Send north/west. */ } else { // Send north. 
if (north != -1) { for (z = 0; z < kblock; z++) { if (k + z >= 1 && k <= nz - 2) { for (j = jst; j <= jend; j++) { jbuf[z][j - jst][0] = g[k + z][j][2][0]; jbuf[z][j - jst][1] = g[k + z][j][2][1]; jbuf[z][j - jst][2] = g[k + z][j][2][2]; jbuf[z][j - jst][3] = g[k + z][j][2][3]; jbuf[z][j - jst][4] = g[k + z][j][2][4]; } } } MPI_Send(jbuf_flat, kblock*(jend-jst+1)*5, MPI_FP_TYPE, north, from_s, MPI_COMM_WORLD); } // Send west. if (west != -1) { for (z = 0; z < kblock; z++) { if (k + z >= 1 && k <= nz - 2) { for (i = ist; i <= iend; i++) { ibuf[z][i - ist][0] = g[k + z][2][i][0]; ibuf[z][i - ist][1] = g[k + z][2][i][1]; ibuf[z][i - ist][2] = g[k + z][2][i][2]; ibuf[z][i - ist][3] = g[k + z][2][i][3]; ibuf[z][i - ist][4] = g[k + z][2][i][4]; } } } MPI_Send(ibuf_flat, kblock*(iend-ist+1)*5, MPI_FP_TYPE, west, from_e, MPI_COMM_WORLD); } } } exchange_3.c0000666000175600017620000001414311440470477011430 0ustar sjpsjp// C port of NPB3.2 // subroutine exchange_3 #include "mpinpb.h" #include "applu.h" /** * Compute the right hand side based on exact solution. */ void exchange_3(fp_type**** g, int iex) { /** * Local variables. */ int i, j, k; int ipos1, ipos2; MPI_Request mid; MPI_Status status; /** * Communicate in the south and north directions. */ if (iex == 0) { if (north != -1) { MPI_Irecv(buf1_flat, 10*ny*nz, MPI_FP_TYPE, MPI_ANY_SOURCE, from_n, MPI_COMM_WORLD, &mid); } /** * Send south. */ if (south != -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; buf[ipos1][0] = g[k][j][nx][0]; buf[ipos1][1] = g[k][j][nx][1]; buf[ipos1][2] = g[k][j][nx][2]; buf[ipos1][3] = g[k][j][nx][3]; buf[ipos1][4] = g[k][j][nx][4]; buf[ipos2][0] = g[k][j][nx+1][0]; buf[ipos2][1] = g[k][j][nx+1][1]; buf[ipos2][2] = g[k][j][nx+1][2]; buf[ipos2][3] = g[k][j][nx+1][3]; buf[ipos2][4] = g[k][j][nx+1][4]; } } MPI_Send(buf_flat, 10*ny*nz, MPI_FP_TYPE, south, from_n, MPI_COMM_WORLD); } /** * Receive from north. 
*/ if (north != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; g[k][j][0][0] = buf1[ipos1][0]; g[k][j][0][1] = buf1[ipos1][1]; g[k][j][0][2] = buf1[ipos1][2]; g[k][j][0][3] = buf1[ipos1][3]; g[k][j][0][4] = buf1[ipos1][4]; g[k][j][1][0] = buf1[ipos2][0]; g[k][j][1][1] = buf1[ipos2][1]; g[k][j][1][2] = buf1[ipos2][2]; g[k][j][1][3] = buf1[ipos2][3]; g[k][j][1][4] = buf1[ipos2][4]; } } } if (south != -1) { MPI_Irecv(buf1_flat, 10*ny*nz, MPI_FP_TYPE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &mid); } /** * Send north. */ if (north != -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; buf[ipos1][0] = g[k][j][3][0]; buf[ipos1][1] = g[k][j][3][1]; buf[ipos1][2] = g[k][j][3][2]; buf[ipos1][3] = g[k][j][3][3]; buf[ipos1][4] = g[k][j][3][4]; buf[ipos2][0] = g[k][j][2][0]; buf[ipos2][1] = g[k][j][2][1]; buf[ipos2][2] = g[k][j][2][2]; buf[ipos2][3] = g[k][j][2][3]; buf[ipos2][4] = g[k][j][2][4]; } } MPI_Send(buf_flat, 10*ny*nz, MPI_FP_TYPE, north, from_s, MPI_COMM_WORLD); } /** * Receive from south. */ if (south != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++){ for (j = 2; j <= ny + 1; j++){ ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; g[k][j][nx+3][0] = buf1[ipos1][0]; g[k][j][nx+3][1] = buf1[ipos1][1]; g[k][j][nx+3][2] = buf1[ipos1][2]; g[k][j][nx+3][3] = buf1[ipos1][3]; g[k][j][nx+3][4] = buf1[ipos1][4]; g[k][j][nx+2][0] = buf1[ipos2][0]; g[k][j][nx+2][1] = buf1[ipos2][1]; g[k][j][nx+2][2] = buf1[ipos2][2]; g[k][j][nx+2][3] = buf1[ipos2][3]; g[k][j][nx+2][4] = buf1[ipos2][4]; } } } /** * Communicate in the east and west directions. */ } else { if (west != -1) { MPI_Irecv(buf1_flat, 10*nx*nz, MPI_FP_TYPE, MPI_ANY_SOURCE, from_w, MPI_COMM_WORLD, &mid); } /** * Send east. 
*/ if (east != -1) { for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++){ ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; buf[ipos1][0] = g[k][ny][i][0]; buf[ipos1][1] = g[k][ny][i][1]; buf[ipos1][2] = g[k][ny][i][2]; buf[ipos1][3] = g[k][ny][i][3]; buf[ipos1][4] = g[k][ny][i][4]; buf[ipos2][0] = g[k][ny+1][i][0]; buf[ipos2][1] = g[k][ny+1][i][1]; buf[ipos2][2] = g[k][ny+1][i][2]; buf[ipos2][3] = g[k][ny+1][i][3]; buf[ipos2][4] = g[k][ny+1][i][4]; } } MPI_Send(buf_flat, 10*nx*nz, MPI_FP_TYPE, east, from_w, MPI_COMM_WORLD); } /** * Receive from west. */ if (west != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++){ for (i = 2; i <= nx + 1; i++){ ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; g[k][0][i][0] = buf1[ipos1][0]; g[k][0][i][1] = buf1[ipos1][1]; g[k][0][i][2] = buf1[ipos1][2]; g[k][0][i][3] = buf1[ipos1][3]; g[k][0][i][4] = buf1[ipos1][4]; g[k][1][i][0] = buf1[ipos2][0]; g[k][1][i][1] = buf1[ipos2][1]; g[k][1][i][2] = buf1[ipos2][2]; g[k][1][i][3] = buf1[ipos2][3]; g[k][1][i][4] = buf1[ipos2][4]; } } } if (east != -1){ MPI_Irecv(buf1_flat, 10*nx*nz, MPI_FP_TYPE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &mid); } /** * Send west. */ if (west != -1) { for (k = 0; k <= nz - 1; k++){ for (i = 2; i <= nx + 1; i++){ ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; buf[ipos1][0] = g[k][3][i][0]; buf[ipos1][1] = g[k][3][i][1]; buf[ipos1][2] = g[k][3][i][2]; buf[ipos1][3] = g[k][3][i][3]; buf[ipos1][4] = g[k][3][i][4]; buf[ipos2][0] = g[k][2][i][0]; buf[ipos2][1] = g[k][2][i][1]; buf[ipos2][2] = g[k][2][i][2]; buf[ipos2][3] = g[k][2][i][3]; buf[ipos2][4] = g[k][2][i][4]; } } MPI_Send(buf_flat, 10*nx*nz, MPI_FP_TYPE, west, from_e, MPI_COMM_WORLD); } /** * Receive from east. 
*/ if (east != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++) { ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; g[k][ny+3][i][0] = buf1[ipos1][0]; g[k][ny+3][i][1] = buf1[ipos1][1]; g[k][ny+3][i][2] = buf1[ipos1][2]; g[k][ny+3][i][3] = buf1[ipos1][3]; g[k][ny+3][i][4] = buf1[ipos1][4]; g[k][ny+2][i][0] = buf1[ipos2][0]; g[k][ny+2][i][1] = buf1[ipos2][1]; g[k][ny+2][i][2] = buf1[ipos2][2]; g[k][ny+2][i][3] = buf1[ipos2][3]; g[k][ny+2][i][4] = buf1[ipos2][4]; } } } } } exchange_4.c0000666000175600017620000000256211440470575011432 0ustar sjpsjp//subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1) #include "mpinpb.h" #include "applu.h" void exchange_4(fp_type** g, fp_type** h, int ibeg, int ifin1, int jbeg, int jfin1){ /** * Local variables. */ int i, j; int ny2; fp_type dum[1024]; MPI_Request msgid1, msgid3; MPI_Status status; ny2 = ny + 2; /** * Communicate in the east and west directions. */ // Receive from east. if (jfin1 == ny) { MPI_Irecv(dum, 2*nx, MPI_FP_TYPE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &msgid3); MPI_Wait(&msgid3, &status); for (i = 1; i <= nx; i++) { g[ny+1][i] = dum[i-1]; h[ny+1][i] = dum[i+nx-1]; } } // Send west. if (jbeg == 1) { for (i = 1; i <= nx; i++) { dum[i-1] = g[1][i]; dum[i+nx-1] = h[1][i]; } MPI_Send(dum, 2*nx, MPI_FP_TYPE, west, from_e, MPI_COMM_WORLD); } /** * Communicate in the south and north directions. */ // Receive from south. if (ifin1 == nx) { MPI_Irecv(dum, 2*ny2, MPI_FP_TYPE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &msgid1); MPI_Wait(&msgid1, &status); for (j = 0; j <= ny + 1; j++){ g[j][nx+1] = dum[j+1-1]; h[j][nx+1] = dum[j+ny2+1-1]; } } // Send north. 
if (ibeg == 1) { for (j = 0; j <= ny+1; j++) { dum[j+1-1] = g[j][1]; dum[j+ny2+1-1] = h[j][1]; } MPI_Send(dum, 2*ny2, MPI_FP_TYPE, north, from_s, MPI_COMM_WORLD); } } exchange_5.c0000666000175600017620000000355111440470626011427 0ustar sjpsjp//subroutine exchange_5(g,ibeg,ifin1) /** * compute the right hand side based on exact solution */ /** implicit none include 'mpinpb.h' include 'applu.incl' */ #include "mpinpb.h" #include "applu.h" void exchange_5(fp_type** g, int ibeg, int ifin1){ /** * input parameters */ /* fp_type precision g(0:isiz2+1,0:isiz3+1) integer ibeg, ifin1 */ /** * local variables */ /** integer k fp_type precision dum(1024) integer msgid1 integer status(MPI_status_SIZE) integer IERROR */ int k; fp_type dum[1024]; MPI_Request msgid1; MPI_Status status; /** * communicate in the south and north directions */ /** * receive from south */ /** if (ifin1.eq.nx) then call MPI_IRECV( dum, > nz, > dp_type, > MPI_ANY_SOURCE, > from_s, > MPI_COMM_WORLD, > msgid1, > IERROR ) call MPI_WAIT( msgid1, status, IERROR ) do k = 1,nz g(nx+1,k) = dum(k) end do end if */ if( ifin1 == nx){ MPI_Irecv(dum, nz, MPI_FP_TYPE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &msgid1); MPI_Wait(&msgid1, &status); for(k = 1; k <= nz; k++){ g[k][nx+1] =dum[k-1]; } } /** * send north */ /** if (ibeg.eq.1) then do k = 1,nz dum(k) = g(1,k) end do call MPI_SEND( dum, > nz, > dp_type, > north, > from_s, > MPI_COMM_WORLD, > IERROR ) end if */ if(ibeg == 1){ for(k = 1; k <= nz; k++){ dum[k-1] = g[k][1]; } MPI_Send( dum, nz, MPI_FP_TYPE, north, from_s, MPI_COMM_WORLD); } // return // end } exchange_6.c0000666000175600017620000000352411440470175011427 0ustar sjpsjp//subroutine exchange_6(g,jbeg,jfin1) /** * compute the right hand side based on exact solution */ // implicit none // include 'mpinpb.h' // include 'applu.incl' #include "mpinpb.h" #include "applu.h" void exchange_6(fp_type** g, int jbeg, int jfin1){ /** * input parameters */ // fp_type precision g(0:isiz2+1,0:isiz3+1) // integer jbeg, 
jfin1 /** * local parameters */ /** integer k fp_type precision dum(1024) integer msgid3 integer status(MPI_status_SIZE) integer IERROR */ int k; fp_type dum[1024]; MPI_Request msgid3; MPI_Status status; /** * communicate in the east and west directions */ /** * receive from east */ /** if (jfin1.eq.ny) then call MPI_IRECV( dum, > nz, > dp_type, > MPI_ANY_SOURCE, > from_e, > MPI_COMM_WORLD, > msgid3, > IERROR ) call MPI_WAIT( msgid3, status, IERROR ) do k = 1,nz g(ny+1,k) = dum(k) end do end if */ if( jfin1 == ny){ MPI_Irecv(dum, nz, MPI_FP_TYPE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &msgid3); MPI_Wait(&msgid3, &status); for(k = 1; k <= nz; k++){ g[k][ny+1] =dum[k-1]; } } /* * send west */ /** if (jbeg.eq.1) then do k = 1,nz dum(k) = g(1,k) end do call MPI_SEND( dum, > nz, > dp_type, > west, > from_e, > MPI_COMM_WORLD, > IERROR ) end if */ if(jbeg == 1){ for(k = 1; k <= nz; k++){ dum[k-1] = g[k][1]; } MPI_Send( dum, nz, MPI_FP_TYPE, west, from_e, MPI_COMM_WORLD); } // return // end } init_comm.c0000666000175600017620000000133011353135227011366 0ustar sjpsjp// C port of NPB3.2 // subroutine init-comm #include "applu.h" #include "mpinpb.h" /** * initialize MPI and establish rank and size * * This is a module in the MPI implementation of LUSSOR * pseudo application from the NAS Parallel Benchmarks. */ void init_comm(int argc, char** argv) { // Initialize MPI communication. MPI_Init(&argc, &argv); // Establish the global rank of this process. MPI_Comm_rank(MPI_COMM_WORLD, &id); // Establish the size of the global group. MPI_Comm_size(MPI_COMM_WORLD, &num); /** * num - number of nodes (processors) * nodedim - a function such that it computes the exponent where num = 2 ^ nodedim * i.e. ndim is the square-root of num. */ ndim = nodedim(num); } jacld.c0000646000175600017620000003576411440536131010501 0ustar sjpsjp// C port of NPB3.2 // subroutine jacld(k) #include "applu.h" /** * Compute the lower triangular part of the jacobian matrix. 
*/
/**
 * jacld(starting_k): fills the 5x5 coefficient blocks for a slab of kblock planes.
 *
 * For every k in [starting_k, starting_k + kblock) and every (j, i) that satisfies
 * 1 <= k <= nz - 2, jst <= j <= jend and ist <= i <= iend, the blocks stored at
 * [k % kblock][j][i] are written:
 *   d - block diagonal, built from u[k][j][i]
 *   a - first sub-diagonal, built from u[k-1][j][i]
 *   b - second sub-diagonal, built from u[k][j-1][i]
 *   c - third sub-diagonal, built from u[k][j][i-1]
 * Points outside that index range are visited by the loops but left untouched.
 */
void jacld(int starting_k) { /** * Local variables. */ int i, j, k; fp_type r43; fp_type c1345; fp_type c34; fp_type tmp1, tmp2, tmp3; // Constants. const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; r43 = ( fpfour / fpthree ); c1345 = c1 * c3 * c4 * c5; c34 = c3 * c4; for (k = starting_k; k < starting_k + kblock; k++) { int level = k % kblock; for (j = 0; j < ny + 4; j++) { for (i = 0; i < nx + 4; i++) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { // Form the block diagonal. // (ie jacld_d) tmp1 = fpone / u[k][j][i][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; d[level][j][i][0][0] = fpone + dt * fptwo * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 ); d[level][j][i][1][0] = fpzero; d[level][j][i][2][0] = fpzero; d[level][j][i][3][0] = fpzero; d[level][j][i][4][0] = fpzero; d[level][j][i][0][1] = dt * fptwo * ( tx1 * ( - r43 * c34 * tmp2 * u[k][j][i][1] ) + ty1 * ( - c34 * tmp2 * u[k][j][i][1] ) + tz1 * ( - c34 * tmp2 * u[k][j][i][1] ) ); d[level][j][i][1][1] = fpone + dt * fptwo * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * fptwo * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 ); d[level][j][i][2][1] = fpzero; d[level][j][i][3][1] = fpzero; d[level][j][i][4][1] = fpzero; d[level][j][i][0][2] = dt * fptwo * ( tx1 * ( - c34 * tmp2 * u[k][j][i][2] ) + ty1 * ( - r43 * c34 * tmp2 * u[k][j][i][2] ) + tz1 * ( - c34 * tmp2 * u[k][j][i][2] ) ); d[level][j][i][1][2] = fpzero; d[level][j][i][2][2] = fpone + dt * fptwo * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * fptwo * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 ); d[level][j][i][3][2] = fpzero; d[level][j][i][4][2] = fpzero; d[level][j][i][0][3] = dt * fptwo * ( tx1 * ( - c34 * tmp2 *
u[k][j][i][3] ) + ty1 * ( - c34 * tmp2 * u[k][j][i][3] ) + tz1 * ( - r43 * c34 * tmp2 * u[k][j][i][3] ) ); d[level][j][i][1][3] = fpzero; d[level][j][i][2][3] = fpzero; d[level][j][i][3][3] = fpone + dt * fptwo * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * fptwo * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 ); d[level][j][i][4][3] = fpzero; d[level][j][i][0][4] = dt * fptwo * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i][1] * u[k][j][i][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][2] * u[k][j][i][2] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][3] * u[k][j][i][3] ) - ( c1345 ) * tmp2 * ( u[k][j][i][4] ) ) + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][1] * u[k][j][i][1] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i][2] * u[k][j][i][2] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][3] * u[k][j][i][3] ) - ( c1345 ) * tmp2 * ( u[k][j][i][4] ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][1] * u[k][j][i][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][2] * u[k][j][i][2] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i][3] * u[k][j][i][3] ) - ( c1345 ) * tmp2 * u[k][j][i][4] ) ); d[level][j][i][1][4] = dt * fptwo * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i][1] + ty1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][1] + tz1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][1] ); d[level][j][i][2][4] = dt * fptwo * ( tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][2] + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i][2] + tz1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][2] ); d[level][j][i][3][4] = dt * fptwo * ( tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][3] + ty1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][3] + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i][3] ); d[level][j][i][4][4] = fpone + dt * fptwo * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * fptwo * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 ); // Form the first block sub-diagonal. // (ie jacld_a). 
tmp1 = fpone / u[k-1][j][i][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; a[level][j][i][0][0] = -dt * tz1 * dz1; a[level][j][i][1][0] = fpzero; a[level][j][i][2][0] = fpzero; a[level][j][i][3][0] = -dt * tz2; a[level][j][i][4][0] = fpzero; a[level][j][i][0][1] = -dt * tz2 * ( - ( u[k-1][j][i][1] * u[k-1][j][i][3] ) * tmp2 ) -dt * tz1 * ( - c34 * tmp2 * u[k-1][j][i][1] ); a[level][j][i][1][1] = -dt * tz2 * ( u[k-1][j][i][3] * tmp1 ) -dt * tz1 * c34 * tmp1 -dt * tz1 * dz2; a[level][j][i][2][1] = fpzero; a[level][j][i][3][1] = -dt * tz2 * ( u[k-1][j][i][1] * tmp1 ); a[level][j][i][4][1] = fpzero; a[level][j][i][0][2] = -dt * tz2 * ( - ( u[k-1][j][i][2] * u[k-1][j][i][3] ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u[k-1][j][i][2] ); a[level][j][i][1][2] = fpzero; a[level][j][i][2][2] = -dt * tz2 * ( u[k-1][j][i][3] * tmp1 ) -dt * tz1 * ( c34 * tmp1 ) -dt * tz1 * dz3; a[level][j][i][3][2] = -dt * tz2 * ( u[k-1][j][i][2] * tmp1 ); a[level][j][i][4][2] = fpzero; a[level][j][i][0][3] = -dt * tz2 * ( - (( u[k-1][j][i][3] * tmp1 ) * ( u[k-1][j][i][3] * tmp1 )) + fphalf * c2 * ( ( u[k-1][j][i][1] * u[k-1][j][i][1] + u[k-1][j][i][2] * u[k-1][j][i][2] + u[k-1][j][i][3] * u[k-1][j][i][3] ) * tmp2 ) ) -dt * tz1 * ( -r43 * c34 * tmp2 * u[k-1][j][i][3] ); a[level][j][i][1][3] = -dt * tz2 * ( - c2 * ( u[k-1][j][i][1] * tmp1 ) ); a[level][j][i][2][3] = -dt * tz2 * ( - c2 * ( u[k-1][j][i][2] * tmp1 ) ); a[level][j][i][3][3] = -dt * tz2 * ( fptwo - c2 ) * ( u[k-1][j][i][3] * tmp1 ) -dt * tz1 * ( r43 * c34 * tmp1 ) -dt * tz1 * dz4; a[level][j][i][4][3] = -dt * tz2 * c2; a[level][j][i][0][4] = -dt * tz2 * ( ( c2 * ( u[k-1][j][i][1] * u[k-1][j][i][1] + u[k-1][j][i][2] * u[k-1][j][i][2] + u[k-1][j][i][3] * u[k-1][j][i][3] ) * tmp2 - c1 * ( u[k-1][j][i][4] * tmp1 ) ) * ( u[k-1][j][i][3] * tmp1 ) ) -dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k-1][j][i][1] * u[k-1][j][i][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k-1][j][i][2] * u[k-1][j][i][2] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k-1][j][i][3] * 
u[k-1][j][i][3] ) - c1345 * tmp2 * u[k-1][j][i][4] ); a[level][j][i][1][4] = -dt * tz2 * ( - c2 * ( u[k-1][j][i][1] * u[k-1][j][i][3] ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u[k-1][j][i][1]; a[level][j][i][2][4] = -dt * tz2 * ( - c2 * ( u[k-1][j][i][2] * u[k-1][j][i][3] ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u[k-1][j][i][2]; a[level][j][i][3][4] = -dt * tz2 * ( c1 * ( u[k-1][j][i][4] * tmp1 ) - fphalf * c2 * ( ( u[k-1][j][i][1] * u[k-1][j][i][1] + u[k-1][j][i][2] * u[k-1][j][i][2] + fpthree * u[k-1][j][i][3] * u[k-1][j][i][3] ) * tmp2 ) ) -dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u[k-1][j][i][3]; a[level][j][i][4][4] = -dt * tz2 * ( c1 * ( u[k-1][j][i][3] * tmp1 ) ) -dt * tz1 * c1345 * tmp1 -dt * tz1 * dz5; // Form the second block sub-diagonal. // (ie jacld_b) tmp1 = fpone / u[k][j-1][i][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; b[level][j][i][0][0] = -dt * ty1 * dy1; b[level][j][i][1][0] = fpzero; b[level][j][i][2][0] = -dt * ty2; b[level][j][i][3][0] = fpzero; b[level][j][i][4][0] = fpzero; b[level][j][i][0][1] = -dt * ty2 * ( - ( u[k][j-1][i][1] * u[k][j-1][i][2] ) * tmp2 ) -dt * ty1 * ( - c34 * tmp2 * u[k][j-1][i][1] ); b[level][j][i][1][1] = -dt * ty2 * ( u[k][j-1][i][2] * tmp1 ) -dt * ty1 * ( c34 * tmp1 ) -dt * ty1 * dy2; b[level][j][i][2][1] = -dt * ty2 * ( u[k][j-1][i][1] * tmp1 ); b[level][j][i][3][1] = fpzero; b[level][j][i][4][1] = fpzero; b[level][j][i][0][2] = -dt * ty2 * ( - ( u[k][j-1][i][2] * tmp1 ) * ( u[k][j-1][i][2] * tmp1 ) + fphalf * c2 * ( ( u[k][j-1][i][1] * u[k][j-1][i][1] + u[k][j-1][i][2] * u[k][j-1][i][2] + u[k][j-1][i][3] * u[k][j-1][i][3] ) * tmp2 ) ) -dt * ty1 * ( -r43 * c34 * tmp2 * u[k][j-1][i][2] ); b[level][j][i][1][2] = -dt * ty2 * ( - c2 * ( u[k][j-1][i][1] * tmp1 ) ); b[level][j][i][2][2] = -dt * ty2 * ( ( fptwo - c2 ) * ( u[k][j-1][i][2] * tmp1 ) ) -dt * ty1 * ( r43 * c34 * tmp1 ) -dt * ty1 * dy3; b[level][j][i][3][2] = -dt * ty2 * ( - c2 * ( u[k][j-1][i][3] * tmp1 ) ); b[level][j][i][4][2] = - dt * ty2 * 
c2; b[level][j][i][0][3] = -dt * ty2 * ( - ( u[k][j-1][i][2] * u[k][j-1][i][3] ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u[k][j-1][i][3] ); b[level][j][i][1][3] = fpzero; b[level][j][i][2][3] = -dt * ty2 * ( u[k][j-1][i][3] * tmp1 ); b[level][j][i][3][3] = -dt * ty2 * ( u[k][j-1][i][2] * tmp1 ) -dt * ty1 * ( c34 * tmp1 ) -dt * ty1 * dy4; b[level][j][i][4][3] = fpzero; b[level][j][i][0][4] = -dt * ty2 * ( ( c2 * ( u[k][j-1][i][1] * u[k][j-1][i][1] + u[k][j-1][i][2] * u[k][j-1][i][2] + u[k][j-1][i][3] * u[k][j-1][i][3] ) * tmp2 - c1 * ( u[k][j-1][i][4] * tmp1 ) ) * ( u[k][j-1][i][2] * tmp1 ) ) -dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u[k][j-1][i][1] * u[k][j-1][i][1] - ( r43 * c34 - c1345 ) * tmp3 * u[k][j-1][i][2] * u[k][j-1][i][2] - ( c34 - c1345 ) * tmp3 * u[k][j-1][i][3] * u[k][j-1][i][3] - c1345 * tmp2 * u[k][j-1][i][4] ); b[level][j][i][1][4] = -dt * ty2 * ( - c2 * ( u[k][j-1][i][1] * u[k][j-1][i][2] ) * tmp2 ) -dt * ty1 * ( c34 - c1345 ) * tmp2 * u[k][j-1][i][1]; b[level][j][i][2][4] = -dt * ty2 * ( c1 * ( u[k][j-1][i][4] * tmp1 ) - fphalf * c2 * ( ( u[k][j-1][i][1] * u[k][j-1][i][1] + fpthree * u[k][j-1][i][2] * u[k][j-1][i][2] + u[k][j-1][i][3] * u[k][j-1][i][3] ) * tmp2 ) ) -dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j-1][i][2]; b[level][j][i][3][4] = -dt * ty2 * ( - c2 * ( u[k][j-1][i][2] * u[k][j-1][i][3] ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u[k][j-1][i][3]; b[level][j][i][4][4] = -dt * ty2 * ( c1 * ( u[k][j-1][i][2] * tmp1 ) ) -dt * ty1 * c1345 * tmp1 -dt * ty1 * dy5; // Form the third block sub-diagonal. 
// (ie jacld_c) tmp1 = fpone / u[k][j][i-1][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; c[level][j][i][0][0] = -dt * tx1 * dx1; c[level][j][i][1][0] = -dt * tx2; c[level][j][i][2][0] = fpzero; c[level][j][i][3][0] = fpzero; c[level][j][i][4][0] = fpzero; c[level][j][i][0][1] = -dt * tx2 * ( - ( u[k][j][i-1][1] * tmp1 ) * ( u[k][j][i-1][1] * tmp1 ) + c2 * fphalf * ( u[k][j][i-1][1] * u[k][j][i-1][1] + u[k][j][i-1][2] * u[k][j][i-1][2] + u[k][j][i-1][3] * u[k][j][i-1][3] ) * tmp2 ) -dt * tx1 * ( -r43 * c34 * tmp2 * u[k][j][i-1][1] ); c[level][j][i][1][1] = -dt * tx2 * ( ( fptwo - c2 ) * ( u[k][j][i-1][1] * tmp1 ) ) -dt * tx1 * ( r43 * c34 * tmp1 ) -dt * tx1 * dx2; c[level][j][i][2][1] = -dt * tx2 * ( -c2 * ( u[k][j][i-1][2] * tmp1 ) ); c[level][j][i][3][1] = -dt * tx2 * ( -c2 * ( u[k][j][i-1][3] * tmp1 ) ); c[level][j][i][4][1] = -dt * tx2 * c2; c[level][j][i][0][2] = -dt * tx2 * ( - ( u[k][j][i-1][1] * u[k][j][i-1][2] ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u[k][j][i-1][2] ); c[level][j][i][1][2] = -dt * tx2 * ( u[k][j][i-1][2] * tmp1 ); c[level][j][i][2][2] = -dt * tx2 * ( u[k][j][i-1][1] * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3; c[level][j][i][3][2] = fpzero; c[level][j][i][4][2] = fpzero; c[level][j][i][0][3] = -dt * tx2 * ( - ( u[k][j][i-1][1] * u[k][j][i-1][3] ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u[k][j][i-1][3] ); c[level][j][i][1][3] = -dt * tx2 * ( u[k][j][i-1][3] * tmp1 ); c[level][j][i][2][3] = fpzero; c[level][j][i][3][3] = -dt * tx2 * ( u[k][j][i-1][1] * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4; c[level][j][i][4][3] = fpzero; c[level][j][i][0][4] = -dt * tx2 * ( ( c2 * ( u[k][j][i-1][1] * u[k][j][i-1][1] + u[k][j][i-1][2] * u[k][j][i-1][2] + u[k][j][i-1][3] * u[k][j][i-1][3] ) * tmp2 - c1 * ( u[k][j][i-1][4] * tmp1 ) ) * ( u[k][j][i-1][1] * tmp1 ) ) -dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i-1][1] * u[k][j][i-1][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i-1][2] * u[k][j][i-1][2] ) - ( c34 - c1345 ) * tmp3 * ( 
u[k][j][i-1][3] * u[k][j][i-1][3] ) - c1345 * tmp2 * u[k][j][i-1][4] ); c[level][j][i][1][4] = -dt * tx2 * ( c1 * ( u[k][j][i-1][4] * tmp1 ) - fphalf * c2 * ( ( fpthree * u[k][j][i-1][1] * u[k][j][i-1][1] + u[k][j][i-1][2] * u[k][j][i-1][2] + u[k][j][i-1][3] * u[k][j][i-1][3] ) * tmp2 ) ) -dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i-1][1]; c[level][j][i][2][4] = -dt * tx2 * ( - c2 * ( u[k][j][i-1][2] * u[k][j][i-1][1] ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i-1][2]; c[level][j][i][3][4] = -dt * tx2 * ( - c2 * ( u[k][j][i-1][3] * u[k][j][i-1][1] ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i-1][3]; c[level][j][i][4][4] = -dt * tx2 * ( c1 * ( u[k][j][i-1][1] * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5; } } } } } jacu.c0000646000175600017620000003574111440536300010337 0ustar sjpsjp// C port of NPB3.2 // subroutine jacu(k) #include "applu.h" /** * Compute the upper triangular part of the jacobian matrix. * * Walks k downwards from starting_k to starting_k - kblock + 1. For every (k, j, i) with 1 <= k <= nz - 2, jst <= j <= jend and ist <= i <= iend it writes the 5x5 blocks stored at [k % kblock][j][i]: d (block diagonal) from u[k][j][i], a from u[k][j][i+1], b from u[k][j+1][i] and c from u[k+1][j][i]. Points outside that index range are visited by the loops but left untouched. */ void jacu(int starting_k) { /** * Local variables. */ int i, j, k; fp_type r43; fp_type c1345; fp_type c34; fp_type tmp1, tmp2, tmp3; // Constants. const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; r43 = ( fpfour / fpthree ); c1345 = c1 * c3 * c4 * c5; c34 = c3 * c4; for (k = starting_k; k > starting_k - kblock; k--) { int level = k % kblock; for (j = 0; j < ny + 4; j++) { for (i = 0; i < nx + 4; i++) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { // Form the block diagonal. // (ie jacu_d).
tmp1 = fpone / u[k][j][i][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; d[level][j][i][0][0] = fpone + dt * fptwo * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 ); d[level][j][i][1][0] = fpzero; d[level][j][i][2][0] = fpzero; d[level][j][i][3][0] = fpzero; d[level][j][i][4][0] = fpzero; d[level][j][i][0][1] = dt * fptwo * ( tx1 * ( - r43 * c34 * tmp2 * u[k][j][i][1] ) + ty1 * ( - c34 * tmp2 * u[k][j][i][1] ) + tz1 * ( - c34 * tmp2 * u[k][j][i][1] ) ); d[level][j][i][1][1] = fpone + dt * fptwo * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * fptwo * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 ); d[level][j][i][2][1] = fpzero; d[level][j][i][3][1] = fpzero; d[level][j][i][4][1] = fpzero; d[level][j][i][0][2] = dt * fptwo * ( tx1 * ( - c34 * tmp2 * u[k][j][i][2] ) + ty1 * ( - r43 * c34 * tmp2 * u[k][j][i][2] ) + tz1 * ( - c34 * tmp2 * u[k][j][i][2] ) ); d[level][j][i][1][2] = fpzero; d[level][j][i][2][2] = fpone + dt * fptwo * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * fptwo * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 ); d[level][j][i][3][2] = fpzero; d[level][j][i][4][2] = fpzero; d[level][j][i][0][3] = dt * fptwo * ( tx1 * ( - c34 * tmp2 * u[k][j][i][3] ) + ty1 * ( - c34 * tmp2 * u[k][j][i][3] ) + tz1 * ( - r43 * c34 * tmp2 * u[k][j][i][3] ) ); d[level][j][i][1][3] = fpzero; d[level][j][i][2][3] = fpzero; d[level][j][i][3][3] = fpone + dt * fptwo * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * fptwo * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 ); d[level][j][i][4][3] = fpzero; d[level][j][i][0][4] = dt * fptwo * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i][1] * u[k][j][i][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][2] * u[k][j][i][2] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][3] * u[k][j][i][3] ) - ( c1345 ) * tmp2 * ( u[k][j][i][4] ) ) + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][1] * u[k][j][i][1] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i][2] * u[k][j][i][2] ) - ( c34 - c1345 ) * tmp3 * ( 
u[k][j][i][3] * u[k][j][i][3] ) - ( c1345 ) * tmp2 * ( u[k][j][i][4] ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][1] * u[k][j][i][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i][2] * u[k][j][i][2] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i][3] * u[k][j][i][3] ) - ( c1345 ) * tmp2 * u[k][j][i][4] ) ); d[level][j][i][1][4] = dt * fptwo * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i][1] + ty1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][1] + tz1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][1] ); d[level][j][i][2][4] = dt * fptwo * ( tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][2] + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i][2] + tz1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][2] ); d[level][j][i][3][4] = dt * fptwo * ( tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][3] + ty1 * ( c34 - c1345 ) * tmp2 * u[k][j][i][3] + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i][3] ); d[level][j][i][4][4] = fpone + dt * fptwo * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * fptwo * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 ); // Form the first block sub-diagonal. // (ie jacu_a). 
tmp1 = fpone / u[k][j][i+1][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; a[level][j][i][0][0] = -dt * tx1 * dx1; a[level][j][i][1][0] = dt * tx2; a[level][j][i][2][0] = fpzero; a[level][j][i][3][0] = fpzero; a[level][j][i][4][0] = fpzero; a[level][j][i][0][1] = dt * tx2 * ( - ( u[k][j][i+1][1] * tmp1 ) * ( u[k][j][i+1][1] * tmp1) + c2 * fphalf * ( u[k][j][i+1][1] * u[k][j][i+1][1] + u[k][j][i+1][2] * u[k][j][i+1][2] + u[k][j][i+1][3] * u[k][j][i+1][3] ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u[k][j][i+1][1] ); a[level][j][i][1][1] = dt * tx2 * ( ( fptwo - c2 ) * ( u[k][j][i+1][1] * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2; a[level][j][i][2][1] = dt * tx2 * ( - c2 * ( u[k][j][i+1][2] * tmp1 ) ); a[level][j][i][3][1] = dt * tx2 * ( - c2 * ( u[k][j][i+1][3] * tmp1 ) ); a[level][j][i][4][1] = dt * tx2 * c2; a[level][j][i][0][2] = dt * tx2 * ( - ( u[k][j][i+1][1] * u[k][j][i+1][2] ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u[k][j][i+1][2] ); a[level][j][i][1][2] = dt * tx2 * ( u[k][j][i+1][2] * tmp1 ); a[level][j][i][2][2] = dt * tx2 * ( u[k][j][i+1][1] * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3; a[level][j][i][3][2] = fpzero; a[level][j][i][4][2] = fpzero; a[level][j][i][0][3] = dt * tx2 * ( - ( u[k][j][i+1][1] * u[k][j][i+1][3] ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u[k][j][i+1][3] ); a[level][j][i][1][3] = dt * tx2 * ( u[k][j][i+1][3] * tmp1 ); a[level][j][i][2][3] = fpzero; a[level][j][i][3][3] = dt * tx2 * ( u[k][j][i+1][1] * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4; a[level][j][i][4][3] = fpzero; a[level][j][i][0][4] = dt * tx2 * ( ( c2 * ( u[k][j][i+1][1] * u[k][j][i+1][1] + u[k][j][i+1][2] * u[k][j][i+1][2] + u[k][j][i+1][3] * u[k][j][i+1][3] ) * tmp2 - c1 * ( u[k][j][i+1][4] * tmp1 ) ) * ( u[k][j][i+1][1] * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j][i+1][1] * u[k][j][i+1][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i+1][2] * u[k][j][i+1][2] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j][i+1][3] * 
u[k][j][i+1][3] ) - c1345 * tmp2 * u[k][j][i+1][4] ); a[level][j][i][1][4] = dt * tx2 * ( c1 * ( u[k][j][i+1][4] * tmp1 ) - fphalf * c2 * ( ( fpthree * u[k][j][i+1][1] * u[k][j][i+1][1] + u[k][j][i+1][2] * u[k][j][i+1][2] + u[k][j][i+1][3] * u[k][j][i+1][3] ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j][i+1][1]; a[level][j][i][2][4] = dt * tx2 * ( - c2 * ( u[k][j][i+1][2] * u[k][j][i+1][1] ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i+1][2]; a[level][j][i][3][4] = dt * tx2 * ( - c2 * ( u[k][j][i+1][3] * u[k][j][i+1][1] ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u[k][j][i+1][3]; a[level][j][i][4][4] = dt * tx2 * ( c1 * ( u[k][j][i+1][1] * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5; // Form the second block sub-diagonal. tmp1 = fpone / u[k][j+1][i][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; b[level][j][i][0][0] = -dt * ty1 * dy1; b[level][j][i][1][0] = fpzero; b[level][j][i][2][0] = dt * ty2; b[level][j][i][3][0] = fpzero; b[level][j][i][4][0] = fpzero; b[level][j][i][0][1] = dt * ty2 * ( - ( u[k][j+1][i][1] * u[k][j+1][i][2] ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u[k][j+1][i][1] ); b[level][j][i][1][1] = dt * ty2 * ( u[k][j+1][i][2] * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2; b[level][j][i][2][1] = dt * ty2 * ( u[k][j+1][i][1] * tmp1 ); b[level][j][i][3][1] = fpzero; b[level][j][i][4][1] = fpzero; b[level][j][i][0][2] = dt * ty2 * ( - ( u[k][j+1][i][2] * tmp1 ) * ( u[k][j+1][i][2] * tmp1 ) + fphalf * c2 * ( ( u[k][j+1][i][1] * u[k][j+1][i][1] + u[k][j+1][i][2] * u[k][j+1][i][2] + u[k][j+1][i][3] * u[k][j+1][i][3] ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u[k][j+1][i][2] ); b[level][j][i][1][2] = dt * ty2 * ( - c2 * ( u[k][j+1][i][1] * tmp1 ) ); b[level][j][i][2][2] = dt * ty2 * ( ( fptwo - c2 ) * ( u[k][j+1][i][2] * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3; b[level][j][i][3][2] = dt * ty2 * ( - c2 * ( u[k][j+1][i][3] * tmp1 ) ); b[level][j][i][4][2] = dt * ty2 * c2; 
b[level][j][i][0][3] = dt * ty2 * ( - ( u[k][j+1][i][2] * u[k][j+1][i][3] ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u[k][j+1][i][3] ); b[level][j][i][1][3] = fpzero; b[level][j][i][2][3] = dt * ty2 * ( u[k][j+1][i][3] * tmp1 ); b[level][j][i][3][3] = dt * ty2 * ( u[k][j+1][i][2] * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4; b[level][j][i][4][3] = fpzero; b[level][j][i][0][4] = dt * ty2 * ( ( c2 * ( u[k][j+1][i][1] * u[k][j+1][i][1] + u[k][j+1][i][2] * u[k][j+1][i][2] + u[k][j+1][i][3] * u[k][j+1][i][3] ) * tmp2 - c1 * ( u[k][j+1][i][4] * tmp1 ) ) * ( u[k][j+1][i][2] * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k][j+1][i][1] * u[k][j+1][i][1] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k][j+1][i][2] * u[k][j+1][i][2] ) - ( c34 - c1345 ) * tmp3 * ( u[k][j+1][i][3] * u[k][j+1][i][3] ) - c1345 * tmp2 * u[k][j+1][i][4] ); b[level][j][i][1][4] = dt * ty2 * ( - c2 * ( u[k][j+1][i][1] * u[k][j+1][i][2] ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u[k][j+1][i][1]; b[level][j][i][2][4] = dt * ty2 * ( c1 * ( u[k][j+1][i][4] * tmp1 ) - fphalf * c2 * ( ( u[k][j+1][i][1] * u[k][j+1][i][1] + fpthree * u[k][j+1][i][2] * u[k][j+1][i][2] + u[k][j+1][i][3] * u[k][j+1][i][3] ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u[k][j+1][i][2]; b[level][j][i][3][4] = dt * ty2 * ( - c2 * ( u[k][j+1][i][2] * u[k][j+1][i][3] ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u[k][j+1][i][3]; b[level][j][i][4][4] = dt * ty2 * ( c1 * ( u[k][j+1][i][2] * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5; // Form the third block sub-diagonal. //(ie jacu_c). 
tmp1 = fpone / u[k+1][j][i][0]; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; c[level][j][i][0][0] = -dt * tz1 * dz1; c[level][j][i][1][0] = fpzero; c[level][j][i][2][0] = fpzero; c[level][j][i][3][0] = dt * tz2; c[level][j][i][4][0] = fpzero; c[level][j][i][0][1] = dt * tz2 * ( - ( u[k+1][j][i][1] * u[k+1][j][i][3] ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u[k+1][j][i][1] ); c[level][j][i][1][1] = dt * tz2 * ( u[k+1][j][i][3] * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2; c[level][j][i][2][1] = fpzero; c[level][j][i][3][1] = dt * tz2 * ( u[k+1][j][i][1] * tmp1 ); c[level][j][i][4][1] = fpzero; c[level][j][i][0][2] = dt * tz2 * ( - ( u[k+1][j][i][2] * u[k+1][j][i][3] ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u[k+1][j][i][2] ); c[level][j][i][1][2] = fpzero; c[level][j][i][2][2] = dt * tz2 * ( u[k+1][j][i][3] * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3; c[level][j][i][3][2] = dt * tz2 * ( u[k+1][j][i][2] * tmp1 ); c[level][j][i][4][2] = fpzero; c[level][j][i][0][3] = dt * tz2 * ( - ( u[k+1][j][i][3] * tmp1 ) * ( u[k+1][j][i][3] * tmp1 ) + fphalf * c2 * ( ( u[k+1][j][i][1] * u[k+1][j][i][1] + u[k+1][j][i][2] * u[k+1][j][i][2] + u[k+1][j][i][3] * u[k+1][j][i][3] ) * tmp2 ) ) -dt * tz1 * ( - r43 * c34 * tmp2 * u[k+1][j][i][3] ); c[level][j][i][1][3] = dt * tz2 * ( - c2 * ( u[k+1][j][i][1] * tmp1 ) ); c[level][j][i][2][3] = dt * tz2 * ( - c2 * ( u[k+1][j][i][2] * tmp1 ) ); c[level][j][i][3][3] = dt * tz2 * ( fptwo - c2 ) * ( u[k+1][j][i][3] * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4; c[level][j][i][4][3] = dt * tz2 * c2; c[level][j][i][0][4] = dt * tz2 * ( ( c2 * ( u[k+1][j][i][1] * u[k+1][j][i][1] + u[k+1][j][i][2] * u[k+1][j][i][2] + u[k+1][j][i][3] * u[k+1][j][i][3] ) * tmp2 - c1 * ( u[k+1][j][i][4] * tmp1 ) ) * ( u[k+1][j][i][3] * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u[k+1][j][i][1] * u[k+1][j][i][1] ) - ( c34 - c1345 ) * tmp3 * ( u[k+1][j][i][2] * u[k+1][j][i][2] ) - ( r43 * c34 - c1345 ) * tmp3 * ( u[k+1][j][i][3] * 
u[k+1][j][i][3] ) - c1345 * tmp2 * u[k+1][j][i][4] ); c[level][j][i][1][4] = dt * tz2 * ( - c2 * ( u[k+1][j][i][1] * u[k+1][j][i][3] ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u[k+1][j][i][1]; c[level][j][i][2][4] = dt * tz2 * ( - c2 * ( u[k+1][j][i][2] * u[k+1][j][i][3] ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u[k+1][j][i][2]; c[level][j][i][3][4] = dt * tz2 * ( c1 * ( u[k+1][j][i][4] * tmp1 ) - fphalf * c2 * ( ( u[k+1][j][i][1] * u[k+1][j][i][1] + u[k+1][j][i][2] * u[k+1][j][i][2] + fpthree * u[k+1][j][i][3] * u[k+1][j][i][3] ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u[k+1][j][i][3]; c[level][j][i][4][4] = dt * tz2 * ( c1 * ( u[k+1][j][i][3] * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5; } } } } } l2norm.c0000646000175600017620000000164411477420570010634 0ustar sjpsjp// C port of NPB3.2 // subroutine l2norm #include "size.h" #include "applu.h" #include "mpinpb.h" #include "timers.h" #include void l2norm (int ldx, int ldy, int ldz, int nx0, int ny0, int nz0, int ist, int iend, int jst, int jend, fp_type**** v, fp_type sum[5]) { /** * Local variables. */ int i, j, k, m; const fp_type fpzero = 0.0e+00; timer_start(7); for (m = 0; m < 5; m++) { sum[m] = fpzero; } for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { sum[m] = sum[m] + v[k][j][i][m] * v[k][j][i][m]; } } } } /** * Compute the global sum of individual contributions to dot product. */ MPI_Allreduce(MPI_IN_PLACE, sum, 5, MPI_FP_TYPE, MPI_SUM, MPI_COMM_WORLD); for (m = 0; m < 5; m++) { sum[m] = sqrt ( sum[m] / ( (nx0-2)*(ny0-2)*(nz0-2) ) ); } timer_stop(7); } main.c0000646000175600017620000002150111477730651010345 0ustar sjpsjp/**** * * * N A S P A R A L L E L B E N C H M A R K S 3.2 * * * * L U * * * * * * This benchmark is part of the NAS Parallel Benchmark 3.2 suite. 
* * It is described in NAS Technical Reports 95-020 and 02-007 * * * * Permission to use, copy, distribute and modify this software * * for any purpose with or without fee is hereby granted. We * * request, however, that all derived work reference the NAS * * Parallel Benchmarks 3.2. This software is provided "as is" * * without express or implied warranty. * * * * Information on NPB 3.2, including the technical report, the * * original specifications, source code, results and information * * on how to submit new results, is available at: * * * * http://www.nas.nasa.gov/Software/NPB/ * * * * Send comments or suggestions to npb@nas.nasa.gov * * * * NAS Parallel Benchmarks Group * * NASA Ames Research Center * * Mail Stop: T27A-1 * * Moffett Field, CA 94035-1000 * * * * E-mail: npb@nas.nasa.gov * * Fax: (650) 604-3957 * * * * *** * * * Authors: S. Weeratunga * * V. Venkatakrishnan * * E. Barszcz * * M. Yarrow * * * * *** * *** * * * C Port: S.J. Pennycook * * O.J Perks * * * * **/ #include #include #include #include "applu.h" #include "globals.h" #include "alloc.h" #include "mpinpb.h" #include "timers.h" #include "util.h" /** * Driver for the performance evaluation of the solver for * five coupled parabolic/elliptic partial differential equations. */ int main(int argc, char* argv[]) { // Initialize communications. init_comm(argc, argv); timer_init(12); // Read if we are GPU or CPU. if (argc != 2) { printf("Usage: ./applu [--cpu or --gpu]\n"); return -1; } #include "alloc.h" int running_on_cpu = 0; if (strcmp(argv[1], "--cpu") == 0) { running_on_cpu = 1; } else if (strcmp(argv[1], "--gpu") == 0) { running_on_cpu = 0; } else { printf("Usage: ./applu [--cpu or --gpu]\n"); return -1; } // Allocate memory for our arrays. 
u_flat = malloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type) ); rsd_flat = malloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type) ); frct_flat = malloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type) ); flux_flat = malloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(fp_type) ); if (u_flat == NULL || rsd_flat == NULL || frct_flat == NULL || flux_flat == NULL) { printf("alloc_ursd() failed.\n"); exit(EXIT_FAILURE); } u = alloc_ursd(u_flat); rsd = alloc_ursd(rsd_flat); frct = alloc_ursd(frct_flat); flux = alloc_ursd(flux_flat); // Allocate memory for the a/b/c/d arrays if we are on the CPU. if (running_on_cpu) { a_flat = malloc( kblock * (isiz2 + 4) * (isiz1 + 4) * 5 * 5 * sizeof(fp_type) ); b_flat = malloc( kblock * (isiz2 + 4) * (isiz1 + 4) * 5 * 5 * sizeof(fp_type) ); c_flat = malloc( kblock * (isiz2 + 4) * (isiz1 + 4) * 5 * 5 * sizeof(fp_type) ); d_flat = malloc( kblock * (isiz2 + 4) * (isiz1 + 4) * 5 * 5 * sizeof(fp_type) ); if (a_flat == NULL || b_flat == NULL || c_flat == NULL || d_flat == NULL) { printf("alloc_abcd() failed.\n"); exit(EXIT_FAILURE); } a = alloc_abcd(a_flat); b = alloc_abcd(b_flat); c = alloc_abcd(c_flat); d = alloc_abcd(d_flat); } if (id == 0) { printf("\n"); printf("===========================================\n"); } // Read input data. read_input(); // Set up processor grid. proc_grid(); // Determine the neighbors. neighbors(); // Set up sub-domain sizes - understand more later for per processor work. subdomain(); if (id == 0) { printf("===========================================\n"); } // Allocate memory for the buffers, based on this subdomain size. 
buf_flat = malloc( 10 * isiz3 * isiz2 * sizeof(fp_type) ); buf1_flat = malloc( 10 * isiz3 * isiz2 * sizeof(fp_type) ); if (buf_flat == NULL || buf1_flat == NULL) { printf("alloc_buffer() failed.\n"); exit(EXIT_FAILURE); } buf = alloc_buffer(buf_flat); buf1 = alloc_buffer(buf1_flat); if (buf == NULL || buf1 == NULL) { printf("alloc_buffer() failed.\n"); exit(EXIT_FAILURE); } // Allocate memory for MPI buffers. ibuf_flat = malloc( kblock * (iend - ist + 1) * 5 * sizeof(fp_type) ); jbuf_flat = malloc( kblock * (jend - jst + 1) * 5 * sizeof(fp_type) ); if (ibuf_flat == NULL || jbuf_flat == NULL) { printf("alloc_ibuf/jbuf() failed.\n"); exit(EXIT_FAILURE); } ibuf = alloc_ibuf(ibuf_flat); jbuf = alloc_jbuf(jbuf_flat); if (ibuf == NULL || jbuf == NULL) { printf("alloc_ibuf/jbuf() failed.\n"); exit(EXIT_FAILURE); } // Set up coefficients. setcoeff(); // Set the masks required for comm. sethyper(); // Set the boundary values for dependent variables. setbv(); // Set the initial values for dependent variables. setiv(); // Compute the forcing term based on prescribed exact solution. erhs(); // Perform one SSOR iteration to touch all data and program pages. if (running_on_cpu) { ssor(1); } else { ssor_gpu(1); } // Reset the boundary and initial values. setbv(); setiv(); // Perform the SSOR iterations - timed. if (running_on_cpu) { ssor(itmax); } else { ssor_gpu(itmax); } // Compute the solution error. error(); // Compute the surface integral. pintgr(); // Verification test. if (id == 0) { verified = verify( rsdnm, errnm, frc, &class ); mflops = ((float) itmax) * (1984.77 * ((float) nx0) * ((float) ny0) * ((float) nz0) -10923.3 * ((float) (nx0 + ny0 + nz0 ) / 3.0) * ((float) (nx0 + ny0 + nz0 ) / 3.0) +27770.9 * ((float) (nx0 + ny0 + nz0 ) / 3.0) -144010.0) / (maxtime * 1000000.0); printf(" itmax = %d\n", itmax); printf(" x = %d\n", nx0); printf(" y = %d\n", ny0); printf(" z = %d\n", nz0); printf("\n"); } // Print timing results. 
int p; for (p = 0; p < num; p++) { if (id == p) { printf("\n"); printf(" Rank = %d\n", id); print_timers(); } MPI_Barrier(MPI_COMM_WORLD); } // Free memory for our arrays. free(u_flat); free(rsd_flat); free(frct_flat); free(flux_flat); free_ursd(u); free_ursd(rsd); free_ursd(frct); free_ursd(flux); if (running_on_cpu) { free(a_flat); free(b_flat); free(c_flat); free(d_flat); free_abcd(a); free_abcd(b); free_abcd(c); free_abcd(d); } free(buf_flat); free(buf1_flat); free_buffer(buf); free_buffer(buf1); free(ibuf_flat); free(jbuf_flat); free_ibuf(ibuf); free_jbuf(jbuf); timer_finalize(); MPI_Finalize(); return 0; } Makefile0000777000175600017620000000602711544626524010726 0ustar sjpsjp# Default to single processor. NPROCS ?= 1 # Default to class S. CLASS ?= S # These are available in case of kernel changes; # current kernels assume particular block sizes. GPUBLOCKY = 1 GPUBLOCKZ = 1 GPUGRIDY = 1 RHSBLOCKZ = 1 # Choose other default values based on class. ifeq ($(CLASS), S) KBLOCK ?= 1 GPUBLOCKX ?= 1 GPUGRIDX ?= 1 GPUGRIDY ?= 1 RHSBLOCKX ?= 4 RHSBLOCKY ?= 4 RHSGRIDX ?= 4 RHSGRIDY ?= 4 else ifeq ($(CLASS), A) KBLOCK ?= 64 GPUBLOCKX ?= 64 GPUGRIDX ?= 73 RHSBLOCKX ?= 8 RHSBLOCKY ?= 8 RHSGRIDX ?= 9 RHSGRIDY ?= 9 else ifeq ($(CLASS), B) KBLOCK ?= 102 GPUBLOCKX ?= 64 GPUGRIDX ?= 176 RHSBLOCKX ?= 8 RHSBLOCKY ?= 8 RHSGRIDX ?= 14 RHSGRIDY ?= 14 else ifeq ($(CLASS), C) KBLOCK ?= 162 GPUBLOCKX ?= 64 GPUGRIDX ?= 431 RHSBLOCKX ?= 8 RHSBLOCKY ?= 8 RHSGRIDX ?= 21 RHSGRIDY ?= 21 endif DEBUG ?= 0 ARCH ?= sm_20 # Default to double precision. PRECISION ?= 2 ifeq ($(PRECISION),1) PREC = SP PRE_PROC += -D SINGLE_PRECISION_LU PRECISION_FLAGS = #-fsingle-precision-constant else ifeq ($(PRECISION),2) PREC = DP PRE_PROC += -D DOUBLE_PRECISION_LU endif # Define compilers. CC = mpicc NVCC = nvcc # Reference to MPICC compiler. CCPATH=`which mpicc` # Compiler flags. 
CFLAGS = -O2 -funroll-loops -msse3 -I ./headers $(PRE_PROC) -Wall NVCCFLAGS = -ccbin=$(CCPATH) --compiler-options "$(CFLAGS)" -O2 -I $(NVIDIA_CUDA_SDK)/common/inc -I . -I ./headers -I ./cuda/ -I ./cuda/kernels -Xptxas -v -arch="$(ARCH)" $(PRE_PROC) LDFLAGS = -lcuda -lcudart -lcutil_x86_64 # Paths to library files. CUDA_DIR = /opt/cuda/toolkit/4.0.11/cuda/lib64/ SDK_DIR = /opt/cuda/sdk/4.0.11/C/lib/ # Check for DEBUG ifeq ($(DEBUG),1) CFLAGS += -g LDFLAGS += -g NVCCFLAGS += -g -G endif # Define executable name. EXE ?= applu.$(PREC).$(CLASS).$(NPROCS).$(KBLOCK) OBJS = main.o \ bcast_inputs.o \ blts.o \ buts.o \ erhs.o \ error.o \ exact.o \ exchange_1.o \ exchange_3.o \ exchange_4.o \ exchange_5.o \ exchange_6.o \ init_comm.o \ jacld.o \ jacu.o \ l2norm.o \ neighbors.o \ nodedim.o \ pintgr.o \ proc_grid.o \ read_input.o \ rhs.o \ setbv.o \ setcoeff.o \ sethyper.o \ setiv.o \ ssor.o \ ssor_cuda.o \ subdomain.o \ verify.o \ alloc.o \ util.o \ timers.o all: clean setparams applu applu: $(OBJS) #$(CC) $(CFLAGS) $(PRECISION_FLAGS) -L /opt/cuda/toolkit/4.0.11/cuda/lib64/ -L /opt/cuda/sdk/4.0.11/C/lib -o ./bin/$(EXE) $(OBJS) -lm $(LDFLAGS) $(CC) $(CFLAGS) $(PRECISION_FLAGS) -L $(CUDA_DIR) -L $(SDK_DIR) -o ./bin/$(EXE) $(OBJS) -lm $(LDFLAGS) ssor_cuda.o: $(NVCC) $(NVCCFLAGS) -c ./cuda/ssor_cuda.cu .c.o: $*.c applu.h $(CC) $(CFLAGS) $(PRECISION_FLAGS) -c $*.c setparams: $(CC) $(CFLAGS) -o setparams setparams.c -lm ./setparams $(NPROCS) $(CLASS) $(KBLOCK) $(GPUBLOCKX) $(GPUBLOCKY) $(GPUBLOCKZ) $(GPUGRIDX) $(GPUGRIDY) $(RHSBLOCKX) $(RHSBLOCKY) $(RHSBLOCKZ) $(RHSGRIDX) $(RHSGRIDY) cp size.h ./headers/size.h clean: rm -f *.o core gmon.out setparams *~ rm -f size.h ./headers/size.h neighbors.c0000666000175600017620000000200111353135231011357 0ustar sjpsjp // subroutine neighbors () // implicit none // include 'applu.incl' #include "applu.h" void neighbors(){ /** * figure out the neighbors and their wrap numbers for each processor */ south = -1; east = -1; north = -1; west = -1; /** 
if (row.gt.1) then north = id -1 else north = -1 end if */ if( row > 1 ){ north = id - 1; }else{ north = -1; } /** if (row.lt.xdim) then south = id + 1 else south = -1 end if */ if( row < xdim){ south = id + 1; }else{ south =-1; } /** if (col.gt.1) then west = id- xdim else west = -1 end if */ if( col > 1){ west = id - xdim; }else{ west = -1; } /** if (col.lt.ydim) then east = id + xdim else east = -1 end if */ if( col < ydim){ east = id + xdim; }else{ east =-1; } // return // end } nodedim.c0000666000175600017620000000077611440472665011053 0ustar sjpsjp#include "applu.h" #include // integer function nodedim(num) int nodedim(int num){ /** * compute the exponent where num = 2**nodedim * NOTE: assumes a power-of-two number of nodes */ // implicit none /** * input parameters */ // integer num /** * local variables */ //fp_type precision fnum fp_type fnum; //fnum = dble(num) fnum = (fp_type) num; //nodedim = log(fnum)/log(2.0d+0) + 0.00001 return log(fnum)/log((fp_type)2.0) + 0.00001; // return // end } pintgr.c0000646000175600017620000001454611544626004010726 0ustar sjpsjp// C port of NPB3.2 // subroutine pintgr #include "applu.h" #include "mpinpb.h" #include #include /** * Set up the sub-domains for integration in each processor. */ void pintgr() { /** * Local variables. */ int i, j, k; int ibeg, ifin, ifin1; int jbeg, jfin, jfin1; int iglob, iglob1, iglob2; int jglob, jglob1, jglob2; int ind1, ind2; fp_type **phi1, **phi2; fp_type frc1, frc2, frc3; fp_type dummy; // Constants. const fp_type c2 = c2_def; const fp_type fpzero = 0.0e+00; const fp_type fpquarter = 0.25e+00; const fp_type fphalf = 0.50e+00; // Allocate for phi1 and phi2. phi1 = malloc( (isiz3 + 2) * sizeof(fp_type*) ); phi2 = malloc( (isiz3 + 2) * sizeof(fp_type*) ); // Allocate arrays. 
for (k = 0; k < (isiz3 + 2); k++) { phi1[k] = malloc( (isiz2 + 2) * sizeof(fp_type) ); phi2[k] = malloc( (isiz2 + 2) * sizeof(fp_type) ); } ibeg = nx + 1; ifin = 0; iglob1 = ipt + 1; iglob2 = ipt + nx; if (iglob1 >= ii1 && iglob2 < ii2 + nx) { ibeg = 1; } if (iglob1 > ii1 - nx && iglob2 <= ii2) { ifin = nx; } if (ii1 >= iglob1 && ii1 <= iglob2) { ibeg = ii1 - ipt; } if (ii2 >= iglob1 && ii2 <= iglob2) { ifin = ii2 - ipt; } jbeg = ny + 1; jfin = 0; jglob1 = jpt + 1; jglob2 = jpt + ny; if (jglob1 >= ji1 && jglob2 < ji2 + ny) { jbeg = 1; } if (jglob1 > ji1 - ny && jglob2 <= ji2) { jfin = ny; } if (ji1 >= jglob1 && ji1 <= jglob2) { jbeg = ji1 - jpt; } if (ji2 >= jglob1 && ji2 <= jglob2) { jfin = ji2 - jpt; } ifin1 = ifin; jfin1 = jfin; if (ipt + ifin1 == ii2) { ifin1 = ifin - 1; } if (jpt + jfin1 == ji2) { jfin1 = jfin - 1; } // Initialize. for (i = 0; i <= isiz2 + 1; i++) { for (k = 0; k <= isiz3 + 1; k++) { phi1[k][i] = 0; phi2[k][i] = 0; } } for (j = jbeg + 1; j <= jfin + 1; j++) { jglob = jpt + j; for (i = ibeg + 1; i <= ifin + 1; i++) { iglob = ipt + i; k = ki1 - 1; phi1[j-1][i-1] = c2 * ( u[k][j][i][4] - fphalf * ( u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] + u[k][j][i][3] * u[k][j][i][3] ) / u[k][j][i][0] ); k = ki2 - 1; phi2[j-1][i-1] = c2 * ( u[k][j][i][4] - fphalf * ( u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] + u[k][j][i][3] * u[k][j][i][3] ) / u[k][j][i][0] ); } } // Communicate in i and j directions. exchange_4(phi1, phi2, ibeg, ifin1, jbeg, jfin1); frc1 = fpzero; for (j = jbeg; j <= jfin1; j++) { for (i = ibeg; i <= ifin1; i++) { frc1 = frc1 + ( phi1[j][i] + phi1[j][i+1] + phi1[j+1][i] + phi1[j+1][i+1] + phi2[j][i] + phi2[j][i+1] + phi2[j+1][i] + phi2[j+1][i+1] ); } } // Compute the global sum of individual contributions to frc1. dummy = frc1; MPI_Allreduce(&dummy, &frc1, 1, MPI_FP_TYPE, MPI_SUM, MPI_COMM_WORLD); frc1 = dxi * deta * frc1; // Initialize. 
for (i = 0; i <= isiz2 + 1; i++) { for (k = 0; k <= isiz3 + 1; k++) { phi1[k][i] = 0; phi2[k][i] = 0; } } jglob = jpt + jbeg; ind1 = 0; if (jglob == ji1) { ind1 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (i = ibeg + 1; i <= ifin + 1; i++) { iglob = ipt + i; phi1[k+1][i-1] = c2 * ( u[k][jbeg+1][i][4] - fphalf * ( u[k][jbeg+1][i][1] * u[k][jbeg+1][i][1] + u[k][jbeg+1][i][2] * u[k][jbeg+1][i][2] + u[k][jbeg+1][i][3] * u[k][jbeg+1][i][3] ) / u[k][jbeg+1][i][0] ); } } } jglob = jpt + jfin; ind2 = 0; if (jglob == ji2) { ind2 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (i = ibeg + 1; i <= ifin + 1; i++) { iglob = ipt + i; phi2[k+1][i-1] = c2 * ( u[k][jfin+1][i][4] - fphalf * ( u[k][jfin+1][i][1] * u[k][jfin+1][i][1] + u[k][jfin+1][i][2] * u[k][jfin+1][i][2] + u[k][jfin+1][i][3] * u[k][jfin+1][i][3] ) / u[k][jfin+1][i][0] ); } } } // Communicate in i direction. if (ind1 == 1) { exchange_5(phi1, ibeg, ifin1); } if (ind2 == 1) { exchange_5(phi2, ibeg, ifin1); } frc2 = fpzero; for (k = ki1; k <= ki2 - 1; k++) { for (i = ibeg; i <= ifin1; i++) { frc2 = frc2 + ( phi1[k][i] + phi1[k][i+1] + phi1[k+1][i] + phi1[k+1][i+1] + phi2[k][i] + phi2[k][i+1] + phi2[k+1][i] + phi2[k+1][i+1] ); } } // Compute the global sum of individual contributions to frc2. dummy = frc2; MPI_Allreduce(&dummy, &frc2, 1, MPI_FP_TYPE, MPI_SUM, MPI_COMM_WORLD); frc2 = dxi * dzeta * frc2; // Initialize. 
for (i = 0; i <= isiz2 + 1; i++) { for (k = 0; k <= isiz3 + 1; k++) { phi1[k][i] = 0; phi2[k][i] = 0; } } iglob = ipt + ibeg; ind1 = 0; if (iglob == ii1) { ind1 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (j = jbeg + 1; j <= jfin + 1; j++) { jglob = jpt + j; phi1[k+1][j-1] = c2 * ( u[k][j][ibeg+1][4] - fphalf * ( u[k][j][ibeg+1][1] * u[k][j][ibeg+1][1] + u[k][j][ibeg+1][2] * u[k][j][ibeg+1][2] + u[k][j][ibeg+1][3] * u[k][j][ibeg+1][3] ) / u[k][j][ibeg+1][0] ); } } } iglob = ipt + ifin; ind2 = 0; if (iglob == ii2) { ind2 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (j = jbeg + 1; j <= jfin + 1; j++) { jglob = jpt + j; phi2[k+1][j-1] = c2 * ( u[k][j][ifin+1][4] - fphalf * ( u[k][j][ifin+1][1] * u[k][j][ifin+1][1] + u[k][j][ifin+1][2] * u[k][j][ifin+1][2] + u[k][j][ifin+1][3] * u[k][j][ifin+1][3] ) / u[k][j][ifin+1][0] ); } } } // Communicate in j direction. if (ind1 == 1) { exchange_6(phi1, jbeg, jfin1); } if (ind2 == 1) { exchange_6(phi2, jbeg, jfin1); } frc3 = fpzero; for (k = ki1; k <= ki2 - 1; k++) { for (j = jbeg; j <= jfin1; j++) { frc3 = frc3 + ( phi1[k][j] + phi1[k][j+1] + phi1[k+1][j] + phi1[k+1][j+1] + phi2[k][j] + phi2[k][j+1] + phi2[k+1][j] + phi2[k+1][j+1] ); } } // Compute the global sum of individual contributions to frc3. dummy = frc3; MPI_Allreduce(&dummy, &frc3, 1, MPI_FP_TYPE, MPI_SUM, MPI_COMM_WORLD); frc3 = deta * dzeta * frc3; frc = fpquarter * (frc1 + frc2 + frc3); if (id == 0) { printf(" Surface integral = %e.\n", frc); } // Free arrays. for (k = 0; k < (isiz3 + 2); k++) { free(phi1[k]); free(phi2[k]); } // Free phi1 and 2. 
free(phi1); free(phi2); } proc_grid.c0000666000175600017620000000231311353135232011356 0ustar sjpsjp /** * subroutine proc_grid */ #include #include #include "applu.h" /** * implicit none * * include 'applu.incl' */ /** * local variables */ /** * set up a two-d grid for processors: column-major ordering of unknowns * NOTE: assumes a power-of-two number of processors */ /** * num - number of nodes(processors) * nodedim - a function such that it compute the exponent where num = 2**nodedim * i.e. ndim is the squar-root of num */ void proc_grid(){ // xdim = 2**(ndim/2) xdim = pow(2, (ndim/2)); // if (mod(ndim,2).eq.1) xdim = xdim + xdim if(ndim%2 == 1){ xdim = xdim + xdim; } // ydim = num/xdim ydim = num/xdim; // write( *, 200) ndim //200 format(' ndim ', i4) if (id == 0) { printf(" ndim %d\n", ndim); // write( *, 210) xdim //210 format(' xdim ', i4) printf(" xdim %d\n", xdim); // write( *, 201) ydim //201 format(' ydim ', i4) printf(" ydim %d\n", ydim); } // row = mod(id,xdim) + 1 row = (id%xdim) + 1; // col = id/xdim + 1 col = (id/xdim) + 1; //printf("Rank %d: Row = %d, Column = %d.\n", id, row, col); // return } read_input.c0000666000175600017620000000635311477403774011570 0ustar sjpsjp#include "mpinpb.h" #include "applu.h" #include void read_input(){ int fstatus, nnodes; FILE *fp; char line[1024]; /** * only root reads the input file * if input file does not exist, it uses defaults * ipr = 1 for detailed progress output * inorm = how often the norm is printed (once every inorm iterations) * itmax = number of pseudo time steps * dt = time step * omega 1 over-relaxation factor for SSOR * tolrsd = steady state residual tolerance levels * nx, ny, nz = number of grid points in x, y, z directions */ root = 0; if(id == root){ fp = fopen("inputlu.data2", "r"); if(fp != NULL){ printf(" Reading from input file inputlu.data\n"); #ifdef SINGLE_PRECISION_LU fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\t%d\n", &ipr, &inorm); fgets(line, 1000, fp); fgets(line, 
1000, fp); fscanf(fp, "%d\n", &itmax); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%f\n", &dt); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%f\n", &omega); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%f\t%f\t%f\t%f\t%f\n", &tolrsd[0], &tolrsd[1], &tolrsd[2], &tolrsd[3], &tolrsd[4]); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\t%d\t%d\n", &nx0, &ny0, &nz0); #endif #ifdef DOUBLE_PRECISION_LU fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\t%d\n", &ipr, &inorm); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\n", &itmax); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%lf\n", &dt); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%lf\n", &omega); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%lf\t%lf\t%lf\t%lf\t%lf\n", &tolrsd[0], &tolrsd[1], &tolrsd[2], &tolrsd[3], &tolrsd[4]); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\t%d\t%d\n", &nx0, &ny0, &nz0); #endif fclose(fp); } else { printf(" Using defaults.\n"); ipr = ipr_default; inorm = inorm_default; itmax = itmax_default; dt = dt_default; omega = omega_default; tolrsd[0] = tolrsd1_def; tolrsd[1] = tolrsd2_def; tolrsd[2] = tolrsd3_def; tolrsd[3] = tolrsd4_def; tolrsd[4] = tolrsd5_def; nx0 = isiz01; ny0 = isiz02; nz0 = isiz03; } // Check problem size. 
MPI_Comm_size(MPI_COMM_WORLD, &nnodes); if(nnodes != nnodes_compiled) { printf(" Warning: program is running on %d processors but was compiled for %d\n", nnodes, nnodes_compiled); } if(nx0 < 4 || ny0 < 4 || nz0 < 4){ printf(" PROBLEM SIZE IS TOO SMALL - SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5\n"); MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER); } if(nx0 > isiz01 || ny0 > isiz02 || nz0 > isiz03) { printf(" PROBLEM SIZE IS TOO LARGE - NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY\n"); MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER); } printf(" NAS Parallel Benchmarks 3.2 -- LU Benchmark\n"); printf(" University of ÌÇÐÄTV CUDA Port\n"); printf(" Size: %d %d %d\n", nx0, ny0, nz0); printf(" Iterations: %d\n", itmax); printf(" Number of processors: %d\n", nnodes); } bcast_inputs(); } rhs.c0000666000175600017620000003477411544626037010234 0ustar sjpsjp// C port of NPB3.2 // subroutine rhs #include "applu.h" /** * Compute the right hand sides. */ void rhs() { /** * Local variables. */ int i, j, k, m; int iex; int L1, L2; int ist1, iend1; int jst1, jend1; fp_type q; fp_type u21, u31, u41; fp_type tmp; fp_type u21i, u31i, u41i, u51i; fp_type u21j, u31j, u41j, u51j; fp_type u21k, u31k, u41k, u51k; fp_type u21im1, u31im1, u41im1, u51im1; fp_type u21jm1, u31jm1, u41jm1, u51jm1; fp_type u21km1, u31km1, u41km1, u51km1; // Constants. const fp_type c1 = c1_def; const fp_type c2 = c2_def; const fp_type c3 = c3_def; const fp_type c4 = c4_def; const fp_type c5 = c5_def; const fp_type fpzero = 0.0e+00; const fp_type fphalf = 0.50e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; const fp_type fpthree = 3.0e+00; const fp_type fpfour = 4.0e+00; const fp_type fpfive = 5.0e+00; const fp_type fpsix = 6.0e+00; for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { for (i = 2; i <= nx + 1; i++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = -frct[k][j][i][m]; } } } } // xi-direction flux differences. 
/** * iex = flag : iex = 0 north/south communication * : iex = 1 east/west communication */ iex = 0; // Communicate and receive/send two rows of data. exchange_3(u, iex); L1 = 1; if (north == -1) { L1 = 2; } L2 = nx + 2; if (south == -1) { L2 = nx + 1; } for (k = 1; k <= nz - 2; k++) { for (j = jst; j<= jend; j++) { for (i = L1; i <= L2; i++) { flux[k][j][i][0] = u[k][j][i][1]; u21 = u[k][j][i][1] / u[k][j][i][0]; q = fphalf * ( u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] + u[k][j][i][3] * u[k][j][i][3] ) / u[k][j][i][0]; flux[k][j][i][1] = u[k][j][i][1] * u21 + c2 * ( u[k][j][i][4] - q ); flux[k][j][i][2] = u[k][j][i][2] * u21; flux[k][j][i][3] = u[k][j][i][3] * u21; flux[k][j][i][4] = ( c1 * u[k][j][i][4] - c2 * q ) * u21; } } } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = rsd[k][j][i][m] - tx2 * ( flux[k][j][i+1][m] - flux[k][j][i-1][m] ); } } L2 = nx + 2; if (south == -1) { L2 = nx + 1; } for (i = ist; i <= L2; i++) { tmp = fpone / u[k][j][i][0]; u21i = tmp * u[k][j][i][1]; u31i = tmp * u[k][j][i][2]; u41i = tmp * u[k][j][i][3]; u51i = tmp * u[k][j][i][4]; tmp = fpone / u[k][j][i-1][0]; u21im1 = tmp * u[k][j][i-1][1]; u31im1 = tmp * u[k][j][i-1][2]; u41im1 = tmp * u[k][j][i-1][3]; u51im1 = tmp * u[k][j][i-1][4]; flux[k][j][i][1] = ( fpfour / fpthree ) * tx3 * (u21i - u21im1); flux[k][j][i][2] = tx3 * ( u31i - u31im1 ); flux[k][j][i][3] = tx3 * ( u41i - u41im1 ); flux[k][j][i][4] = fphalf * ( fpone - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (fpone/fpsix) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } for (i = ist; i <= iend; i++) { rsd[k][j][i][0] = rsd[k][j][i][0] + dx1 * tx1 * ( u[k][j][i-1][0] - fptwo * u[k][j][i][0] + u[k][j][i+1][0] ); rsd[k][j][i][1] = rsd[k][j][i][1] + tx3 * c3 * c4 * ( flux[k][j][i+1][1] - flux[k][j][i][1] ) + 
dx2 * tx1 * ( u[k][j][i-1][1] - fptwo * u[k][j][i][1] + u[k][j][i+1][1] ); rsd[k][j][i][2] = rsd[k][j][i][2] + tx3 * c3 * c4 * ( flux[k][j][i+1][2] - flux[k][j][i][2] ) + dx3 * tx1 * ( u[k][j][i-1][2] - fptwo * u[k][j][i][2] + u[k][j][i+1][2] ); rsd[k][j][i][3] = rsd[k][j][i][3] + tx3 * c3 * c4 * ( flux[k][j][i+1][3] - flux[k][j][i][3] ) + dx4 * tx1 * ( u[k][j][i-1][3] - fptwo * u[k][j][i][3] + u[k][j][i+1][3] ); rsd[k][j][i][4] = rsd[k][j][i][4] + tx3 * c3 * c4 * ( flux[k][j][i+1][4] - flux[k][j][i][4] ) + dx5 * tx1 * ( u[k][j][i-1][4] - fptwo * u[k][j][i][4] + u[k][j][i+1][4] ); } /** * Fourth-order dissipation. */ if (north == -1) { for (m = 0; m < 5; m++) { rsd[k][j][3][m] = rsd[k][j][3][m] - dssp * ( + fpfive * u[k][j][3][m] - fpfour * u[k][j][4][m] + u[k][j][5][m] ); rsd[k][j][4][m] = rsd[k][j][4][m] - dssp * ( - fpfour * u[k][j][3][m] + fpsix * u[k][j][4][m] - fpfour * u[k][j][5][m] + u[k][j][6][m] ); } } ist1 = 2; iend1 = nx + 1; if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } for (i = ist1; i <= iend1; i++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = rsd[k][j][i][m] - dssp * ( u[k][j][i-2][m] - fpfour * u[k][j][i-1][m] + fpsix * u[k][j][i][m] - fpfour * u[k][j][i+1][m] + u[k][j][i+2][m] ); } } if (south == -1) { for (m = 0; m < 5; m++) { rsd[k][j][nx-1][m] = rsd[k][j][nx-1][m] - dssp * ( u[k][j][nx-3][m] - fpfour * u[k][j][nx-2][m] + fpsix * u[k][j][nx-1][m] - fpfour * u[k][j][nx][m] ); rsd[k][j][nx][m] = rsd[k][j][nx][m] - dssp * ( u[k][j][nx-2][m] - fpfour * u[k][j][nx-1][m] + fpfive * u[k][j][nx][m] ); } } } } /** * eta-direction flux differences. */ // iex = flag : iex = 0 north/south communication iex = 1; // Communicate and receive/send two rows of data. 
exchange_3(u, iex); L1 = 1; if (west == -1) { L1 = 2; } L2 = ny + 2; if (east == -1) { L2 = ny + 1; } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = L1; j <= L2; j++) { flux[k][j][i][0] = u[k][j][i][2]; u31 = u[k][j][i][2] / u[k][j][i][0]; q = fphalf * ( u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] + u[k][j][i][3] * u[k][j][i][3] ) / u[k][j][i][0]; flux[k][j][i][1] = u[k][j][i][1] * u31; flux[k][j][i][2] = u[k][j][i][2] * u31 + c2 * ( u[k][j][i][4] - q ); flux[k][j][i][3] = u[k][j][i][3] * u31; flux[k][j][i][4] = ( c1 * u[k][j][i][4] - c2 * q ) * u31; } } } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = jst; j <= jend; j++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = rsd[k][j][i][m] - ty2 * ( flux[k][j+1][i][m] - flux[k][j-1][i][m] ); } } L2 = ny + 2; if (east == -1) { L2 = ny + 1; } for (j = jst; j <= L2; j++) { tmp = fpone / u[k][j][i][0]; u21j = tmp * u[k][j][i][1]; u31j = tmp * u[k][j][i][2]; u41j = tmp * u[k][j][i][3]; u51j = tmp * u[k][j][i][4]; tmp = fpone / u[k][j-1][i][0]; u21jm1 = tmp * u[k][j-1][i][1]; u31jm1 = tmp * u[k][j-1][i][2]; u41jm1 = tmp * u[k][j-1][i][3]; u51jm1 = tmp * u[k][j-1][i][4]; flux[k][j][i][1] = ty3 * ( u21j - u21jm1 ); flux[k][j][i][2] = (fpfour/fpthree) * ty3 * (u31j - u31jm1); flux[k][j][i][3] = ty3 * ( u41j - u41jm1 ); flux[k][j][i][4] = fphalf * ( fpone - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (fpone/fpsix) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } for (j = jst; j <= jend; j++) { rsd[k][j][i][0] = rsd[k][j][i][0] + dy1 * ty1 * ( u[k][j-1][i][0] - fptwo * u[k][j][i][0] + u[k][j+1][i][0] ); rsd[k][j][i][1] = rsd[k][j][i][1] + ty3 * c3 * c4 * ( flux[k][j+1][i][1] - flux[k][j][i][1] ) + dy2 * ty1 * ( u[k][j-1][i][1] - fptwo * u[k][j][i][1] + u[k][j+1][i][1] ); rsd[k][j][i][2] = rsd[k][j][i][2] + ty3 * c3 * c4 * (flux[k][j+1][i][2] - 
flux[k][j][i][2] ) + dy3 * ty1 * ( u[k][j-1][i][2] - fptwo * u[k][j][i][2] + u[k][j+1][i][2] ); rsd[k][j][i][3] = rsd[k][j][i][3] + ty3 * c3 * c4 * (flux[k][j+1][i][3] - flux[k][j][i][3] ) + dy4 * ty1 * ( u[k][j-1][i][3] - fptwo * u[k][j][i][3] + u[k][j+1][i][3] ); rsd[k][j][i][4] = rsd[k][j][i][4] + ty3 * c3 * c4 * (flux[k][j+1][i][4] - flux[k][j][i][4] ) + dy5 * ty1 * ( u[k][j-1][i][4] - fptwo * u[k][j][i][4] + u[k][j+1][i][4] ); } /** * Fourth-order dissipation. */ if (west == -1) { for (m = 0; m < 5; m++) { rsd[k][3][i][m] = rsd[k][3][i][m] - dssp * ( + fpfive * u[k][3][i][m] - fpfour * u[k][4][i][m] + u[k][5][i][m] ); rsd[k][4][i][m] = rsd[k][4][i][m] - dssp * ( - fpfour * u[k][3][i][m] + fpsix * u[k][4][i][m] - fpfour * u[k][5][i][m] + u[k][6][i][m] ); } } jst1 = 2; jend1 = ny + 1; if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } for (j = jst1; j <= jend1; j++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = rsd[k][j][i][m] - dssp * ( u[k][j-2][i][m] - fpfour * u[k][j-1][i][m] + fpsix * u[k][j][i][m] - fpfour * u[k][j+1][i][m] + u[k][j+2][i][m] ); } } if (east == -1) { for (m = 0; m < 5; m++) { rsd[k][ny-1][i][m] = rsd[k][ny-1][i][m] - dssp * ( u[k][ny-3][i][m] - fpfour * u[k][ny-2][i][m] + fpsix * u[k][ny-1][i][m] - fpfour * u[k][ny][i][m] ); rsd[k][ny][i][m] = rsd[k][ny][i][m] - dssp * ( u[k][ny-2][i][m] - fpfour * u[k][ny-1][i][m] + fpfive * u[k][ny][i][m] ); } } } } /** * zeta-direction flux differences. 
*/ for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (k = 0; k <= nz - 1; k++) { flux[k][j][i][0] = u[k][j][i][3]; u41 = u[k][j][i][3] / u[k][j][i][0]; q = fphalf * ( u[k][j][i][1] * u[k][j][i][1] + u[k][j][i][2] * u[k][j][i][2] + u[k][j][i][3] * u[k][j][i][3] ) / u[k][j][i][0]; flux[k][j][i][1] = u[k][j][i][1] * u41; flux[k][j][i][2] = u[k][j][i][2] * u41; flux[k][j][i][3] = u[k][j][i][3] * u41 + c2 * ( u[k][j][i][4] - q ); flux[k][j][i][4] = ( c1 * u[k][j][i][4] - c2 * q ) * u41; } for (k = 1; k <= nz - 2; k++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = rsd[k][j][i][m] - tz2 * ( flux[k+1][j][i][m] - flux[k-1][j][i][m] ); } } for (k = 1; k <= nz - 1; k++) { tmp = fpone / u[k][j][i][0]; u21k = tmp * u[k][j][i][1]; u31k = tmp * u[k][j][i][2]; u41k = tmp * u[k][j][i][3]; u51k = tmp * u[k][j][i][4]; tmp = fpone / u[k-1][j][i][0]; u21km1 = tmp * u[k-1][j][i][1]; u31km1 = tmp * u[k-1][j][i][2]; u41km1 = tmp * u[k-1][j][i][3]; u51km1 = tmp * u[k-1][j][i][4]; flux[k][j][i][1] = tz3 * ( u21k - u21km1 ); flux[k][j][i][2] = tz3 * ( u31k - u31km1 ); flux[k][j][i][3] = (fpfour/fpthree) * tz3 * ( u41k - u41km1 ); flux[k][j][i][4] = fphalf * ( fpone - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (fpone/fpsix) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } for (k = 1; k <= nz - 2; k++) { rsd[k][j][i][0] = rsd[k][j][i][0] + dz1 * tz1 * ( u[k-1][j][i][0] - fptwo * u[k][j][i][0] + u[k+1][j][i][0] ); rsd[k][j][i][1] = rsd[k][j][i][1] + tz3 * c3 * c4 * ( flux[k+1][j][i][1] - flux[k][j][i][1] ) + dz2 * tz1 * ( u[k-1][j][i][1] - fptwo * u[k][j][i][1] + u[k+1][j][i][1] ); rsd[k][j][i][2] = rsd[k][j][i][2] + tz3 * c3 * c4 * ( flux[k+1][j][i][2] - flux[k][j][i][2] ) + dz3 * tz1 * ( u[k-1][j][i][2] - fptwo * u[k][j][i][2] + u[k+1][j][i][2] ); rsd[k][j][i][3] = rsd[k][j][i][3] + tz3 * c3 * c4 * ( flux[k+1][j][i][3] - flux[k][j][i][3] ) + dz4 * tz1 * ( 
u[k-1][j][i][3] - fptwo * u[k][j][i][3] + u[k+1][j][i][3] ); rsd[k][j][i][4] = rsd[k][j][i][4] + tz3 * c3 * c4 * ( flux[k+1][j][i][4] - flux[k][j][i][4] ) + dz5 * tz1 * ( u[k-1][j][i][4] - fptwo * u[k][j][i][4] + u[k+1][j][i][4] ); } /** * Fourth-order dissipation. */ for (m = 0; m < 5; m++) { rsd[1][j][i][m] = rsd[1][j][i][m] - dssp * ( + fpfive * u[1][j][i][m] - fpfour * u[2][j][i][m] + u[3][j][i][m] ); rsd[2][j][i][m] = rsd[2][j][i][m] - dssp * ( - fpfour * u[1][j][i][m] + fpsix * u[2][j][i][m] - fpfour * u[3][j][i][m] + u[4][j][i][m] ); } for (k = 3; k <= nz - 4; k++) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = rsd[k][j][i][m] - dssp * ( u[k-2][j][i][m] - fpfour * u[k-1][j][i][m] + fpsix * u[k][j][i][m] - fpfour * u[k+1][j][i][m] + u[k+2][j][i][m] ); } } for (m = 0; m < 5; m++) { rsd[nz-3][j][i][m] = rsd[nz-3][j][i][m] - dssp * ( u[nz-5][j][i][m] - fpfour * u[nz-4][j][i][m] + fpsix * u[nz-3][j][i][m] - fpfour * u[nz-2][j][i][m] ); rsd[nz-2][j][i][m] = rsd[nz-2][j][i][m] - dssp * ( u[nz-4][j][i][m] - fpfour * u[nz-3][j][i][m] + fpfive * u[nz-2][j][i][m] ); } } } } setbv.c0000666000175600017620000000254711353135233010543 0ustar sjpsjp// C port of NPB3.2 // subroutine setbv #include "applu.h" /** * Set the boundary values of dependent variables. */ void setbv() { /** * Local variables. */ int i, j, k; int iglob, jglob; /** * Set the dependent variable values along the top and bottom faces. */ for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; exact(iglob, jglob, 0, &u[0][j][i][0]); exact(iglob, jglob, nz - 1, &u[nz-1][j][i][0]); } } /** * Set the dependent variable values along north and south faces. 
*/ if (west == -1 ) { for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; exact(iglob, 2, k, &u[k][2][i][0]); } } } if (east == -1) { for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; exact(iglob, ny0 + 1, k, &u[k][ny+1][i][0]); } } } /** * Set the dependent variable values along east and west faces. */ if (north == -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; exact(2, jglob, k, &u[k][j][2][0]); } } } if (south == -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; exact(nx0 + 1, jglob, k, &u[k][j][nx+1][0]); } } } } setcoeff.c0000666000175600017620000000526211544626055011224 0ustar sjpsjp// C port of NPB3.2 // subroutine setcoeff #include "applu.h" // Define max function. #define max2(x,y) ( (x > y) ? x : y ) #define max3(x,y,z) ( max2( max2(x, y), z ) ) /** * Setup coefficients. */ void setcoeff() { dxi = 1.0e+00 / ( nx0 - 1 ); deta = 1.0e+00 / ( ny0 - 1 ); dzeta = 1.0e+00 / ( nz0 - 1 ); tx1 = 1.0e+00 / ( dxi * dxi ); tx2 = 1.0e+00 / ( 2.0e+00 * dxi ); tx3 = 1.0e+00 / dxi; ty1 = 1.0e+00 / ( deta * deta ); ty2 = 1.0e+00 / ( 2.0e+00 * deta ); ty3 = 1.0e+00 / deta; tz1 = 1.0e+00 / ( dzeta * dzeta ); tz2 = 1.0e+00 / ( 2.0e+00 * dzeta ); tz3 = 1.0e+00 / dzeta; ii1 = 2; ii2 = nx0 - 1; ji1 = 2; ji2 = ny0 - 2; ki1 = 3; ki2 = nz0 - 1; // Diffusion coefficients. dx1 = 0.75e+00; dx2 = dx1; dx3 = dx1; dx4 = dx1; dx5 = dx1; dy1 = 0.75e+00; dy2 = dy1; dy3 = dy1; dy4 = dy1; dy5 = dy1; dz1 = 1.00e+00; dz2 = dz1; dz3 = dz1; dz4 = dz1; dz5 = dz1; // Fourth difference dissipation. dssp = ( max3 (dx1, dy1, dz1 ) ) / 4.0e+00; //dssp = 1.00e+00 / 4.0e+00; // We know that dz1 is the maximum. // Coefficients of the exact solution to the first pde. 
ce[0][0] = 2.0e+00; ce[1][0] = 0.0e+00; ce[2][0] = 0.0e+00; ce[3][0] = 4.0e+00; ce[4][0] = 5.0e+00; ce[5][0] = 3.0e+00; ce[6][0] = 5.0e-01; ce[7][0] = 2.0e-02; ce[8][0] = 1.0e-02; ce[9][0] = 3.0e-02; ce[10][0] = 5.0e-01; ce[11][0] = 4.0e-01; ce[12][0] = 3.0e-01; // Coefficients of the exact solution to the second pde. ce[0][1] = 1.0e+00; ce[1][1] = 0.0e+00; ce[2][1] = 0.0e+00; ce[3][1] = 0.0e+00; ce[4][1] = 1.0e+00; ce[5][1] = 2.0e+00; ce[6][1] = 3.0e+00; ce[7][1] = 1.0e-02; ce[8][1] = 3.0e-02; ce[9][1] = 2.0e-02; ce[10][1] = 4.0e-01; ce[11][1] = 3.0e-01; ce[12][1] = 5.0e-01; // Coefficients of the exact solution to the third pde. ce[0][2] = 2.0e+00; ce[1][2] = 2.0e+00; ce[2][2] = 0.0e+00; ce[3][2] = 0.0e+00; ce[4][2] = 0.0e+00; ce[5][2] = 2.0e+00; ce[6][2] = 3.0e+00; ce[7][2] = 4.0e-02; ce[8][2] = 3.0e-02; ce[9][2] = 5.0e-02; ce[10][2] = 3.0e-01; ce[11][2] = 5.0e-01; ce[12][2] = 4.0e-01; // Coefficients of the exact solution to the fourth pde. ce[0][3] = 2.0e+00; ce[1][3] = 2.0e+00; ce[2][3] = 0.0e+00; ce[3][3] = 0.0e+00; ce[4][3] = 0.0e+00; ce[5][3] = 2.0e+00; ce[6][3] = 3.0e+00; ce[7][3] = 3.0e-02; ce[8][3] = 5.0e-02; ce[9][3] = 4.0e-02; ce[10][3] = 2.0e-01; ce[11][3] = 1.0e-01; ce[12][3] = 3.0e-01; // Coefficients of the exact solution to the fifth pde. ce[0][4] = 5.0e+00; ce[1][4] = 4.0e+00; ce[2][4] = 3.0e+00; ce[3][4] = 2.0e+00; ce[4][4] = 1.0e-01; ce[5][4] = 4.0e-01; ce[6][4] = 3.0e-01; ce[7][4] = 5.0e-02; ce[8][4] = 4.0e-02; ce[9][4] = 3.0e-02; ce[10][4] = 1.0e-01; ce[11][4] = 3.0e-01; ce[12][4] = 2.0e-01; } sethyper.c0000666000175600017620000000357311477730407011276 0ustar sjpsjp// C port of NPB3.2 // subroutine sethyper #include "applu.h" /** * Fore each column in a hyperplane, istart = first row, */ void sethyper() { /** * Local variables. */ int i, j; int iglob, jglob; int kp; /** * Compute the pointers for hyperplanes. 
*/ for (kp = 2; kp <= nx0 + ny0; kp++) { icomms[kp-1] = 0; icommn[kp-1] = 0; icomme[kp-1] = 0; icommw[kp-1] = 0; /** * Check to see if comm. to south is required. */ if (south != -1) { i = iend; iglob = ipt + i; jglob = kp - iglob; j = jglob - jpt; if (jglob >= 2 && jglob <= ny0 - 1 && j >= jst && j <= jend) { icomms[kp-1] = 1; } } /** * Check to see if comm. to north is required. */ if (north != -1) { i = ist; iglob = ipt + i; jglob = kp - iglob; j = jglob - jpt; if (jglob >= 2 && jglob <= ny0 - 1 && j >= jst && j <= jend) { icommn[kp-1] = 1; } } /** * Check to see if comm. to east is required. */ if (east != -1) { j = jend; jglob = jpt + j; iglob = kp - jglob; i = iglob - ipt; if (iglob >= 2 && iglob <= nx0 - 1 && i >= ist && i <= iend) { icomme[kp-1] = 1; } } /** * Check to see if comm. to west is required. */ if (west != -1) { j = jst; jglob = jpt + j; iglob = kp - jglob; i = iglob - ipt; if (iglob >= 2 && iglob <= nx0 - 1 && i >= ist && i <= iend) { icommw[kp-1] = 1; } } } //intf("Memory issues: %d, %d, %d, %d, %d, %d, %d, %d, %d\n", &icomms[0], &icommn[0], &icomme[0], &icommw[0], &icomms[nx0 + ny0 + 1], &icommn[nx0 + ny0 + 1], &icomme[nx0 + ny0 + 1], &icommw[nx0 + ny0 + 1], &dzeta); //printf("npmax = %d\n", npmax); //printf("nx0 + ny0 + 1 = %d\n", nx0 + ny0 + 1); icomms[0] = 0; icommn[0] = 0; icomme[0] = 0; icommw[0] = 0; icomms[nx0 + ny0] = 0; icommn[nx0 + ny0] = 0; icomme[nx0 + ny0] = 0; icommw[nx0 + ny0] = 0; } setiv.c0000666000175600017620000000315211440541544010546 0ustar sjpsjp// C port of NPB3.2 // subroutine setiv #include "applu.h" /** * Set the initial values of independent variables based on tri-linear * interpolation of boundary values in the computational space. */ void setiv() { /** * Local variables. 
*/ int i, j, k, m; int iglob, jglob; fp_type xi, eta, zeta; fp_type pxi, peta, pzeta; fp_type ue_1jk[5], ue_nx0jk[5], ue_i1k[5], ue_iny0k[5], ue_ij1[5], ue_ijnz[5]; const fp_type fpone = 1.0e+00; for (k = 1; k <= nz - 2; k++) { zeta = ( (fp_type) k ) / (nz - 1); for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; if (jglob != 2 && jglob != ny0 + 1) { eta = ( (fp_type) (jglob - 2) ) / (ny0 - 1); for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; if (iglob != 2 && iglob != nx0 + 1) { xi = ( (fp_type) (iglob - 2) ) / (nx0 - 1); // Find exact solutions for all i, j, k. exact(2, jglob, k, ue_1jk); exact(nx0 + 1, jglob, k, ue_nx0jk); exact(iglob, 2, k, ue_i1k); exact(iglob, ny0 + 1, k, ue_iny0k); exact(iglob, jglob, 0, ue_ij1); exact(iglob, jglob, nz - 1, ue_ijnz); for (m = 0; m < 5; m++) { pxi = (fpone - xi) * ue_1jk[m] + xi * ue_nx0jk[m]; peta = (fpone - eta) * ue_i1k[m] + eta * ue_iny0k[m]; pzeta = (fpone - zeta) * ue_ij1[m] + zeta * ue_ijnz[m]; u[k][j][i][m] = pxi + peta + pzeta - pxi * peta - peta * pzeta - pzeta * pxi + pxi * peta * pzeta; } } } } } } } setparams.c0000644000175600017620000001353611502131337011410 0ustar sjpsjp// C port of NPB3.2 #define VERSION "3.2" #define FILENAME "size.h" #include #include #define max(x,y) ((x > y) ? x : y) /** * Integer log base two. * Return error if argument isn't a power of two or is less than or equal to zero. */ int ilog2(int i) { int log2; int exp2 = 1; if (i <= 0) { return -1; } for (log2 = 0; log2 < 20; log2++) { if (exp2 == i) { return log2; } exp2 *= 2; } return -1; } /** * Integer power of two. */ int ipow2(int i) { int pow2 = 1; if (i < 0) { return -1; } if (i == 0) { return 1; } while (i--) { pow2 *= 2; } return pow2; } /** * Creates an appropriate size.h file for a given class and number of processors. */ int main(int argc, char* argv[]) { // Check we have the right number of command line arguments. 
if (argc != 14) { //printf("Usage: ./setparams [number of processors] [problem class] [k-blocking] [gpu-blocking] [gpu tile size] [gpu grid size] [rhs-blocking] [rhs tile size] [rhs grid size]\n"); printf("Usage: ./setparams [number of processors] [problem class] [k-blocking] [gpublock x] [gpublock y] [gpublock z] [gpugrid x] [gpugrid y] [rhsblock x] [rhsblock y] [rhsblock z] [rhsgrid x] [rhsgrid y]\n"); return -1; } // Define some variables for later. int nprocs, class, kblock, gpublock_x, gpublock_y, gpublock_z, gpugrid_x, gpugrid_y; int rhsblock_x, rhsblock_y, rhsblock_z, rhsgrid_x, rhsgrid_y; int isiz1, isiz2, itmax, inorm, problem_size; int xdiv, ydiv; char* dt_default; // Read the command line arguments. /*nprocs = atoi(argv[1]); class = *argv[2]; kblock = atoi(argv[3]); gpublock = atoi(argv[4]); gputile = atoi(argv[5]); gpugrid = atoi(argv[6]); rhsblock = atoi(argv[7]); rhstile = atoi(argv[8]); rhsgrid = atoi(argv[9]);*/ nprocs = atoi(argv[1]); class = *argv[2]; kblock = atoi(argv[3]); gpublock_x = atoi(argv[4]); gpublock_y = atoi(argv[5]); gpublock_z = atoi(argv[6]); gpugrid_x = atoi(argv[7]); gpugrid_y = atoi(argv[8]); rhsblock_x = atoi(argv[9]); rhsblock_y = atoi(argv[10]); rhsblock_z = atoi(argv[11]); rhsgrid_x = atoi(argv[12]); rhsgrid_y = atoi(argv[13]); // Open a size.h file for writing. FILE* file; file = fopen(FILENAME, "w"); if (!file) { printf("Cannot open %s for writing.\n", FILENAME); return -1; } // Write the header. fprintf(file, "/**\n"); fprintf(file, " * Define the problem and sub-problem sizes.\n"); fprintf(file, " */\n"); fprintf(file, "\n"); // Set the defaults for a given class. 
if (class == 'S') { problem_size = 12; dt_default = "0.5e+00"; itmax = 50; } else if (class == 'W') { problem_size = 33; dt_default = "1.5e-03"; itmax = 300; } else if (class == 'A') { problem_size = 64; dt_default = "2.0e+00"; itmax = 250; } else if (class == 'B') { problem_size = 102; dt_default = "2.0e+00"; itmax = 250; } else if (class == 'C') { problem_size = 162; dt_default = "2.0e+00"; itmax = 250; } else if (class == 'D') { problem_size = 408; dt_default = "1.0e+00"; itmax = 300; } else if (class == 'E') { problem_size = 1020; dt_default = "0.5e+00"; itmax = 300; } else { printf("Unknown class %c.\n", class); fclose(file); return -1; } // Calculate the sub-problem size. inorm = itmax; xdiv = ydiv = ilog2(nprocs) / 2; if (xdiv + ydiv != ilog2(nprocs)) { xdiv += 1; } xdiv = ipow2(xdiv); ydiv = ipow2(ydiv); isiz1 = problem_size / xdiv; if (isiz1 * xdiv < problem_size) { isiz1++; } isiz2 = problem_size / ydiv; if (isiz2 * ydiv < problem_size) { isiz2++; } // Write the number of processors. fprintf(file, "// Number of processors compiled for.\n"); fprintf(file, "#define nnodes_compiled %d\n", nprocs); fprintf(file, "\n"); // Write the full array size. fprintf(file, "// Full array size.\n"); fprintf(file, "#define isiz01 %d\n", problem_size); fprintf(file, "#define isiz02 %d\n", problem_size); fprintf(file, "#define isiz03 %d\n", problem_size); fprintf(file, "\n"); // Write the array size per processor. fprintf(file, "// Array size per processor.\n"); fprintf(file, "#define isiz1 %d\n", isiz1); fprintf(file, "#define isiz2 %d\n", isiz2); fprintf(file, "#define isiz3 isiz03\n"); fprintf(file, "\n"); // Write the other information. fprintf(file, "// Number of iterations, etc.\n"); fprintf(file, "#define itmax_default %d\n", itmax); fprintf(file, "#define inorm_default %d\n", inorm); fprintf(file, "#define dt_default %s\n", dt_default); fprintf(file, "\n"); // Write the kblocking and gpu information. 
fprintf(file, "#define kblock %d\n", kblock); //fprintf(file, "#define problem_height %d\n", max(kblock, problem_size)); // Defining problem height as the amount of memory that should be allocated in z direction for ursd. if (kblock >= problem_size) { fprintf(file, "#define problem_height %d\n", kblock); } else if ( (problem_size % kblock) != 0 ) { fprintf(file, "#define problem_height %d\n", ((problem_size / kblock) * kblock) + kblock); } else { fprintf(file, "#define problem_height %d\n", problem_size); } fprintf(file, "#define gpublock_x %d\n", gpublock_x); fprintf(file, "#define gpublock_y %d\n", gpublock_y); fprintf(file, "#define gpublock_z %d\n", gpublock_z); fprintf(file, "#define gpugrid_x %d\n", gpugrid_x); fprintf(file, "#define gpugrid_y %d\n", gpugrid_y); fprintf(file, "#define rhsblock_x %d\n", rhsblock_x); fprintf(file, "#define rhsblock_y %d\n", rhsblock_y); fprintf(file, "#define rhsblock_z %d\n", rhsblock_z); fprintf(file, "#define rhsgrid_x %d\n", rhsgrid_x); fprintf(file, "#define rhsgrid_y %d\n", rhsgrid_y); /*fprintf(file, "#define gpublock %d\n", gpublock); fprintf(file, "#define gputile %d\n", gputile); fprintf(file, "#define gpugrid %d\n", gpugrid); fprintf(file, "#define rhstile %d\n", rhstile); fprintf(file, "#define rhsgrid %d\n", rhsgrid); fprintf(file, "#define rhsblock %d\n", rhsblock);*/ // Close the file and exit cleanly. fclose(file); return 0; } ssor.c0000646000175600017620000001345011477731071010410 0ustar sjpsjp// C port of NPB3.2 // subroutine ssor(niter) #include "applu.h" #include "mpinpb.h" #include "timers.h" #include #include /** * To perform pseudo-time stepping SSOR iterations for give nonlinear pde's. */ void ssor(int niter) { /** * Local variables. */ int i, j, k, m, l; int istep; fp_type tmp; fp_type delunm[5], ***tv; root = 0; // Allocate memory for tv. 
tv = (fp_type***) malloc( (isiz2 + 4) * sizeof(fp_type**) ); for (k = 0; k < (isiz2 + 4); k++) { tv[k] = (fp_type**) malloc ( (isiz1 + 4) * sizeof(fp_type*) ); } for (k = 0; k < (isiz2 + 4); k++) { for (j = 0; j < (isiz1 + 4); j++) { tv[k][j] = (fp_type*) malloc ( 5 * sizeof(fp_type) ); } } // Begin pseudo-time stepping iterations. const fp_type fpzero = 0.0e+00; const fp_type fpone = 1.0e+00; const fp_type fptwo = 2.0e+00; tmp = fpone / ( omega * ( fptwo - omega ) ); // Initialize a, b, c, d to zero (guarantees that page tables have been formed, if // applicable on given architecture, before timestepping. for (k = 0; k < kblock; k++) { for (j = 0; j < (isiz2 + 4); j++) { for (i = 0; i < (isiz1 + 4); i++) { for (l = 0; l < 5; l++) { for (m = 0; m < 5; m++) { a[k][j][i][l][m] = fpzero; b[k][j][i][l][m] = fpzero; c[k][j][i][l][m] = fpzero; d[k][j][i][l][m] = fpzero; } } } } } // Compute the steady-state residuals. rhs(); // Compute the L2 norms of newton iteration residuals. l2norm(isiz1, isiz2, isiz3, nx0, ny0, nz0, ist, iend, jst, jend, rsd, rsdnm); MPI_Barrier(MPI_COMM_WORLD); // Make sure all aggregate timers are initialised. timer_reset(); // Start the timer for the ssor loop. timer_start(0); // Start the timestep loop - do for niter times. for (istep = 1; istep <= niter; istep++) { if (niter == 1) { printf(" k-blocking factor of %d.\n", kblock); } if (id == 0) { if (istep % 20 == 0 || istep == itmax || istep == 1) { if (niter > 1) { printf(" Pseudo-time SSOR iteration no. = %d.\n", istep); } } } // Perform SSOR iteration. timer_start(1); for (k = 0; k < nz; k++) { for (j = 0; j < ny + 4; j++) { for (i = 0; i < nx + 4; i++) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { for (m = 0; m < 5; m++) { rsd[k][j][i][m] = dt * rsd[k][j][i][m]; } } } } } timer_stop(1); // For each tile in k-dimension (from bottom to top). for (k = 0; k < nz; k += kblock) { // Form the lower triangular part of the jacobian matrix. 
timer_start(2); jacld(k); timer_stop(2); // Perform the lower triangular solution. timer_start(3); blts(isiz1, isiz2, isiz3, nx, ny, nz, k, omega, rsd, a, b, c, d, ist, iend, jst, jend, nx0, ny0, ipt, jpt); timer_stop(3); } // For each tile in k-dimension (from top to bottom). for (k = nz - 1; k >= 0; k -= kblock) { // Form the strictly upper triangular part of the jacobian matrix. timer_start(4); jacu(k); timer_stop(4); // Perform the upper triangular solution. timer_start(5); buts(isiz1, isiz2, isiz3, nx, ny, nz, k, omega, rsd, tv, d, a, b, c, ist, iend, jst, jend, nx0, ny0, ipt, jpt); timer_stop(5); } // Update the variables. timer_start(6); for (k = 0; k < nz; k++) { for (j = 0; j < ny + 4; j++) { for (i = 0; i < nx + 4; i++) { if (k >= 1 && k <= nz - 2 && j >= jst && j <= jend && i >= ist && i <= iend) { for (m = 0; m < 5; m++) { u[k][j][i][m] = u[k][j][i][m] + tmp * rsd[k][j][i][m]; } } } } } timer_stop(6); // Compute the max-norms of newton iteration corrections. // l2norm seems to contain a collective MPI operation. if (istep % inorm == 0) { l2norm(isiz1, isiz2, isiz3, nx0, ny0, nz0, ist, iend, jst, jend, rsd, delunm); if (ipr == 1 && id == 0) { printf(" RMS-norm of SSOR-iteration correction for first pde = %e.\n", delunm[0]); printf(" RMS-norm of SSOR-iteration correction for second pde = %e.\n", delunm[1]); printf(" RMS-norm of SSOR-iteration correction for third pde = %e.\n", delunm[2]); printf(" RMS-norm of SSOR-iteration correction for fourth pde = %e.\n", delunm[3]); printf(" RMS-norm of SSOR-iteration correction for fifth pde = %e.\n", delunm[4]); } else if (ipr == 2 && id == 0) { printf(" %d, %f.\n", istep, delunm[4]); } } // Compute the steady-state residuals. timer_start(8); rhs(); timer_stop(8); // Compute the max-norms of newton iteration residuals. // l2norm seems to contain a collective MPI operation. 
if (istep % inorm == 0 || istep == itmax ) { l2norm(isiz1, isiz2, isiz3, nx0, ny0, nz0, ist, iend, jst, jend, rsd, rsdnm); if (ipr == 1 && id == 0) { printf(" RMS-norm of steady state residual for first pde = %e.\n", rsdnm[0]); printf(" RMS-norm of steady state residual for second pde = %e.\n", rsdnm[1]); printf(" RMS-norm of steady state residual for third pde = %e.\n", rsdnm[2]); printf(" RMS-norm of steady state residual for fourth pde = %e.\n", rsdnm[3]); printf(" RMS-norm of steady state residual for fifth pde = %e.\n", rsdnm[4]); } } // Check the newton-iteration residuals against the tolerance levels. if (rsdnm[0] < tolrsd[0] && rsdnm[1] < tolrsd[1] && rsdnm[2] < tolrsd[2] && rsdnm[3] < tolrsd[3] && rsdnm[4] < tolrsd[4]) { if (ipr == 1 && id == 0) { printf(" Convergence was achieved after %d pseudo-time steps.\n", istep); return; } } } // End the timers. timer_stop(0); // Free memory for tv. for (k = 0; k < (isiz2 + 4); k++) { for (j = 0; j < (isiz1 + 4); j++) { free(tv[k][j]); } } for (k = 0; k < (isiz2 + 4); k++) { free(tv[k]); } free(tv); } subdomain.c0000666000175600017620000000333711477417740011414 0ustar sjpsjp// C port of NPB3.2 // subroutine subdomain #include "applu.h" #include "mpinpb.h" #include /** * Set up the sub-domain sizes. */ void subdomain() { /** * Local variables. */ int mm, ierror, errorcode; // x dimension mm = nx0 % xdim; if (row <= mm) { nx = nx0 / xdim + 1; ipt = (row - 1) * nx; } else { nx = nx0 / xdim; ipt = (row - 1) * nx + mm; } // y dimension mm = ny0 % ydim; if (col <= mm) { ny = ny0 / ydim + 1; jpt = (col - 1) * ny; } else { ny = ny0 / ydim; jpt = (col - 1) * ny + mm; } // z dimension nz = nz0; // Check the sub-domain size. if (nx < 4 || ny < 4 || nz < 4 ) { printf(" Subdomain size is too small - adjust problem size or number of processors so that "); printf(" nx, ny and nz are greater than or equal to 4. 
They are currently %d, %d, %d.\n", nx, ny, nz); MPI_Abort(MPI_COMM_WORLD, 1); } if (nx > isiz1 || ny > isiz2 || nz > isiz3) { printf(" Subdomain size is too large - adjust problem size or number of processors so that "); printf(" nx, ny and nz are less than or equal to isiz1, isiz2 and isiz3 respectively. They are "); printf(" currently %d, %d, %d.\n", nx, ny, nz); MPI_Abort(MPI_COMM_WORLD, 1); } if (id == 0) { printf(" Subdomain: %d x %d x %d.\n", nx, ny, nz); } // Set up the start and end in i and j extents for all processors. // Originals: /*ist = 1; iend = nx; if (north == -1) { ist = 2; } if (south == -1) { iend = nx - 1; } jst = 1; jend = ny; if (west == -1) { jst = 2; } if (east == -1) { jend = ny - 1; }*/ // New versions. ist = 2; iend = nx + 1; if (north == -1) { ist = 3; } if (south == -1) { iend = nx; } jst = 2; jend = ny + 1; if (west == -1) { jst = 3; } if (east == -1) { jend = ny; } } timers.c0000644000175600017620000000565611477420470010732 0ustar sjpsjp#include "timers.h" #include #include #include #include #include // Array of timers. Timer* timers; int no_timers; /** * Return the current cpu and wall time. */ void timer(double* cpu, double* et) { struct rusage r; struct timeval t; getrusage( RUSAGE_SELF, &r ); *cpu = r.ru_utime.tv_sec + r.ru_utime.tv_usec*1.0e-6; gettimeofday( &t, (struct timezone *)0 ); *et = t.tv_sec + t.tv_usec*1.0e-6; } /** * Create the timers. */ void timer_init(int number) { // Allocate an array of timers. timers = (Timer*) malloc(number * sizeof(Timer)); no_timers = number; // Initialise them all to 0. int i; for (i = 0; i < no_timers; i++) { timer_reset(i); } } /** * Reset all timers. */ void timer_reset() { int i; for (i = 0; i < no_timers; i++) { timers[i].cpu_start = 0; timers[i].cpu_end = 0; timers[i].wall_start = 0; timers[i].wall_end = 0; timers[i].cpu_total = 0; timers[i].wall_total = 0; timers[i].calls = 0; } } /** * Start a timer. 
*/ void timer_start(int i) { timer(&timers[i].cpu_start, &timers[i].wall_start); } /** * Stop a timer. */ void timer_stop(int i) { // Update the timer. timer(&timers[i].cpu_end, &timers[i].wall_end); timers[i].cpu_total = timers[i].cpu_total + (timers[i].cpu_end - timers[i].cpu_start); timers[i].wall_total = timers[i].wall_total + (timers[i].wall_end - timers[i].wall_start); timers[i].calls = timers[i].calls + 1; } /** * Pause a timer. */ void timer_pause(int i) { timer(&timers[i].cpu_end, &timers[i].wall_end); timers[i].cpu_total = timers[i].cpu_total + (timers[i].cpu_end - timers[i].cpu_start); timers[i].wall_total = timers[i].wall_total + (timers[i].wall_end - timers[i].wall_start); } /** * Restart a timer. */ void timer_restart(int i) { timer(&timers[i].cpu_start, &timers[i].wall_start); } /** * Clean up. */ void timer_finalize() { free(timers); } /** * Return the CPU timer total. */ double timer_cpu_total(int i) { return timers[i].cpu_total; } /** * Return the CPU timer average. */ double timer_cpu_avg(int i) { if (timers[i].calls == 0) { return 0; } return timers[i].cpu_total / timers[i].calls; } /** * Return the Wall timer total. */ double timer_wall_total(int i) { return timers[i].wall_total; } /** * Return the Wall timer average. */ double timer_wall_avg(int i) { if (timers[i].calls == 0) { return 0; } return timers[i].wall_total / timers[i].calls; } /** * Print the timer format information. */ void timer_print_format() { printf(" NAME :\tCPU TOTAL\tWALL TOTAL\tCPU AVG\tWALL AVG\tCALLS\n"); } /** * Print the timer information in a sensible way. */ void timer_print(const char* name, int i) { // Prints as 'Name: CPU_TOTAL WALL_TOTAL CPU_AVG WALL_AVG CALLS printf(" %s:\t%f\t%f\t%f\t%f\t%d\n", name, timer_cpu_total(i), timer_wall_total(i), timer_cpu_avg(i), timer_wall_avg(i), timers[i].calls); } util.c0000644000175600017620000000101411477673643010400 0ustar sjpsjp#include #include #include "applu.h" #include "timers.h" /** * Print all of the timers. 
*/ void print_timers() { printf(" TIMING RESULTS:\n"); timer_print_format(); timer_print("SSOR ", 0); timer_print("PRE ", 1); timer_print("JACLD ", 2); timer_print("BLTS ", 3); timer_print("JACU ", 4); timer_print("BUTS ", 5); timer_print("POST ", 6); timer_print("L2NORM", 7); timer_print("RHS ", 8); timer_print("REARR ", 9); timer_print("EX1 ", 10); timer_print("EX3 ", 11); } verify.c0000666000175600017620000002150611477713111010724 0ustar sjpsjp// C port of NPB3.2 // subroutine verify(xcr, xce, xci, class, verified) #include "applu.h" #include "mpinpb.h" #include #include /** * Verification routine. * Returns 1 / 0 depending on success / failure. */ int verify(fp_type xcr[5], fp_type xce[5], fp_type xci, char class) { /** * Local variables. */ fp_type xcrref[5], xceref[5], xciref, xcrdif[5], xcedif[5], xcidif, epsilon, dtref; int m; int verified; // Tolerance level. epsilon = 1.0e-08; class = 'U'; verified = 1; for (m = 0; m < 5; m++) { xcrref[m] = 1.0; xceref[m] = 1.0; } xciref = 1.0; // Check for CLASS = S. if (nx0 == 12 && ny0 == 12 && nz0 == 12 && itmax == 50) { class = 'S'; dtref = 5.0e-1; // Reference values of RMS-norms of residual, for the 12x12x12 grid, // after 50 time steps, with DT = 5.0e-1; xcrref[0] = 1.6196343210976702e-02; xcrref[1] = 2.1976745164821318e-03; xcrref[2] = 1.5179927653399185e-03; xcrref[3] = 1.5029584435994323e-03; xcrref[4] = 3.4264073155896461e-02; // Reference values of RMS-norms of solution error, for the 12x12x12 grid, // after 50 time steps, with DT = 5.0e-1. xceref[0] = 6.4223319957960924e-04; xceref[1] = 8.4144342047347926e-05; xceref[2] = 5.8588269616485186e-05; xceref[3] = 5.8474222595157350e-05; xceref[4] = 1.3103347914111294e-03; // Reference values of surface integral, for the 12x12x12 grid, // after 50 time steps, with DT = 5.0e-1. xciref = 7.8418928865937083e+00; } // Check for CLASS = W. 
else if (nx0 == 33 && ny0 == 33 && nz0 == 33 && itmax == 300) { class = 'W'; dtref = 1.5e-3; // Reference values of RMS-norms of residual, for the 33x33x33 grid, // after 300 time steps, with DT = 1.5d-3. xcrref[0] = 0.1236511638192e+02; xcrref[1] = 0.1317228477799e+01; xcrref[2] = 0.2550120713095e+01; xcrref[3] = 0.2326187750252e+01; xcrref[4] = 0.2826799444189e+02; // Reference values for RMS-norms of solution error, for the 33x33x33 grid, // after 300 time steps, with DT = 1.5d-3. xceref[0] = 0.4867877144216e+00; xceref[1] = 0.5064652880982e-01; xceref[2] = 0.9281818101960e-01; xceref[3] = 0.8570126542733e-01; xceref[4] = 0.1084277417792e+01; // Reference value of surface integral, for the 33x33x33 grid, // after 300 time steps, with DT = 1.5d-3. xciref = 0.1161399311023e+02; } // Check for CLASS = A. else if (nx0 == 64 && ny0 == 64 && nz0 == 64 && itmax == 250) { class = 'A'; dtref = 2.0e+0; // Reference values of RMS-norms of residual, for the 64x64x64 grid, // after 250 time steps, with DT = 2.0e+00. xcrref[0] = 7.7902107606689367e+02; xcrref[1] = 6.3402765259692870e+01; xcrref[2] = 1.9499249727292479e+02; xcrref[3] = 1.7845301160418537e+02; xcrref[4] = 1.8384760349464247e+03; // Reference values of RMS-norms of solution error, for the 64x64x64 grid, // after 250 time steps, with DT = 2.0d+00. xceref[0] = 2.9964085685471943e+01; xceref[1] = 2.8194576365003349e+00; xceref[2] = 7.3473412698774742e+00; xceref[3] = 6.7139225687777051e+00; xceref[4] = 7.0715315688392578e+01; // Reference value of surface integral, for the 64x64x64 grid, // after 250 time steps, with DT = 2.0d+00. xciref = 2.6030925604886277e+01; } // Check for CLASS = B. 
else if (nx0 == 102 && ny0 == 102 && nz0 == 102 && itmax == 250) { class = 'B'; dtref = 2.0e+0; // Reference values of RMS-norms of residual, for the (102X102X102) grid, // after 250 time steps, with DT = 2.0d+00 xcrref[0] = 3.5532672969982736e+03; xcrref[1] = 2.6214750795310692e+02; xcrref[2] = 8.8333721850952190e+02; xcrref[3] = 7.7812774739425265e+02; xcrref[4] = 7.3087969592545314e+03; // Reference values of RMS-norms of solution error, for the (102X102X102) // grid, after 250 time steps, with DT = 2.0d+00 xceref[0] = 1.1401176380212709e+02; xceref[1] = 8.1098963655421574e+00; xceref[2] = 2.8480597317698308e+01; xceref[3] = 2.5905394567832939e+01; xceref[4] = 2.6054907504857413e+02; // Reference value of surface integral, for the (102X102X102) grid, // after 250 time steps, with DT = 2.0d+00 xciref = 4.7887162703308227e+01; } // Check for CLASS = C. else if (nx0 == 162 && ny0 == 162 && nz0 == 162 && itmax == 250) { class = 'C'; dtref = 2.0e+0; // Reference values of RMS-norms of residual, for the (162X162X162) grid, // after 250 time steps, with DT = 2.0d+00 xcrref[0] = 1.03766980323537846e+04; xcrref[1] = 8.92212458801008552e+02; xcrref[2] = 2.56238814582660871e+03; xcrref[3] = 2.19194343857831427e+03; xcrref[4] = 1.78078057261061185e+04; // Reference values of RMS-norms of solution error, for the (162X162X162) // grid, after 250 time steps, with DT = 2.0d+00 xceref[0] = 2.15986399716949279e+02; xceref[1] = 1.55789559239863600e+01; xceref[2] = 5.41318863077207766e+01; xceref[3] = 4.82262643154045421e+01; xceref[4] = 4.55902910043250358e+02; // Reference value of surface integral, for the (162X162X162) grid, // after 250 time steps, with DT = 2.0d+00 xciref = 6.66404553572181300e+01; } // Check for CLASS = D. 
else if (nx0 == 408 && ny0 == 408 && nz0 == 408 && itmax == 300) { class = 'D'; dtref = 1.0e+0; // Reference values of RMS-norms of residual, for the (408X408X408) grid, // after 300 time steps, with DT = 1.0d+00 xcrref[0] = 0.4868417937025e+05; xcrref[1] = 0.4696371050071e+04; xcrref[2] = 0.1218114549776e+05; xcrref[3] = 0.1033801493461e+05; xcrref[4] = 0.7142398413817e+05; // Reference values of RMS-norms of solution error, for the (408X408X408) // grid, after 300 time steps, with DT = 1.0d+00 xceref[0] = 0.3752393004482e+03; xceref[1] = 0.3084128893659e+02; xceref[2] = 0.9434276905469e+02; xceref[3] = 0.8230686681928e+02; xceref[4] = 0.7002620636210e+03; // Reference value of surface integral, for the (408X408X408) grid, // after 300 time steps, with DT = 1.0d+00 xciref = 0.8334101392503e+02; } else if (nx0 == 1020 && ny0 == 1020 && nz0 == 1020) { class = 'E'; dtref = 0.5e+00; // Reference values of RMS-norms of residual. xcrref[0] = 0.2099641687874e+06; xcrref[1] = 0.2130403143165e+05; xcrref[2] = 0.5319228789371e+05; xcrref[3] = 0.4509761639833e+05; xcrref[4] = 0.2932360006590e+06; // Reference values of RMS-norms of solution error. xceref[0] = 0.4800572578333e+03; xceref[1] = 0.4221993400184e+02; xceref[2] = 0.1210851906824e+03; xceref[3] = 0.1047888986770e+03; xceref[4] = 0.8363028257389e+03; // Reference value of surface integral. xciref = 0.9512163272273e+02; } else { verified = 0; } // Compute the difference of solution values and the known reference values. for (m = 0; m < 5; m++) { xcrdif[m] = fabs( (xcr[m] - xcrref[m]) / xcrref[m] ); xcedif[m] = fabs( (xce[m] - xceref[m]) / xceref[m] ); } xcidif = fabs( (xci - xciref) / xciref ); // Output the comparison of computed results to known cases. 
if (class != 'U') { printf(" Verification being performanced for class %c.\n", class); printf(" Accuracy setting for epsilon = %20.13E.\n", epsilon); if ( fabs(dt - dtref) > epsilon) { verified = 0; class = 'U'; printf(" DT does not match the reference value of %20.13E.\n", dtref); } } else { printf(" Unknown class.\n"); } if (class != 'U') { printf(" Comparison of RMS-norms of residual.\n"); } else { printf(" RMS-norms of residual.\n"); } for (m = 0; m < 5; m++) { if (class == 'U') { printf(" %d\t%20.13E\n", m, xcr[m]); } else if ( xcrdif[m] > epsilon ) { verified = 0; printf(" FAILURE: %d\t%20.13E, %20.13E, %20.13E\n", m, xcr[m], xcrref[m], xcrdif[m]); } else { printf(" %d\t%20.13E, %20.13E, %20.13E\n", m, xcr[m], xcrref[m], xcrdif[m]); } } if (class != 'U') { printf(" Comparison of RMS-norms of solution error.\n"); } else { printf(" RMS-norms of solution error.\n"); } for (m = 0; m < 5; m++) { if (class == 'U') { printf(" %d\t%20.13E\n", m, xce[m]); } else if ( xcedif[m] <= epsilon ) { printf(" %d\t%20.13E, %20.13E, %20.13E\n", m, xce[m], xceref[m], xcedif[m]); } else { verified = 0; printf(" FAILURE: %d\t%20.13E, %20.13E, %20.13E\n", m, xce[m], xceref[m], xcedif[m]); } } if (class != 'U') { printf(" Comparison of surface integral.\n"); } else { printf(" Surface integral.\n"); } if (class == 'U') { printf(" %20.13E\n", xci); } else if ( xcidif <= epsilon ) { printf(" %20.13E, %20.13E, %20.13E\n", xci, xciref, xcidif); } else { verified = 0; printf(" FAILURE: %20.13E, %20.13E, %20.13E\n", xci, xciref, xcidif); } if (class == 'U') { printf(" No reference values provided.\n"); printf(" No verification performed.\n"); } else if (verified) { printf(" Verification SUCCESSFUL.\n"); } else { printf(" Verification FAILED.\n"); } return verified; }