headers/0000755000175600017620000000000011573566600010665 5ustar sjpsjpkernels/0000755000175600017620000000000011561537724010717 5ustar sjpsjpkernels/vector2/0000755000175600017620000000000011553016033012265 5ustar sjpsjpkernels/vector/0000755000175600017620000000000011570740306012211 5ustar sjpsjpkernels/scalar/0000755000175600017620000000000011570740255012157 5ustar sjpsjpkernels/vector2/bak/0000755000175600017620000000000011553030665013031 5ustar sjpsjpkernels/vector2/.svn/0000755000175600017620000000000011551607762013166 5ustar sjpsjpkernels/vector2/rhs/0000755000175600017620000000000011553015413013062 5ustar sjpsjpkernels/vector/bak/0000755000175600017620000000000011553016261012743 5ustar sjpsjpkernels/vector/.svn/0000755000175600017620000000000011753220711013071 5ustar sjpsjpkernels/vector/rhs/0000755000175600017620000000000011553620343013004 5ustar sjpsjpkernels/scalar/rhs/0000755000175600017620000000000011553332316012747 5ustar sjpsjpkernels/scalar/.svn/0000755000175600017620000000000011753220711013034 5ustar sjpsjpkernels/vector2/.svn/props/0000755000175600017620000000000011542404476014326 5ustar sjpsjpkernels/vector2/.svn/prop-base/0000755000175600017620000000000011542404476015053 5ustar sjpsjpkernels/vector2/.svn/tmp/0000755000175600017620000000000011551607761013765 5ustar sjpsjpkernels/vector2/.svn/text-base/0000755000175600017620000000000011546346005015054 5ustar sjpsjpkernels/vector2/rhs/zeta/0000755000175600017620000000000011546346005014033 5ustar sjpsjpkernels/vector2/rhs/xi/0000755000175600017620000000000011553015412013501 5ustar sjpsjpkernels/vector2/rhs/eta/0000755000175600017620000000000011553015574013643 5ustar sjpsjpkernels/vector2/rhs/.svn/0000755000175600017620000000000011551607762013762 5ustar sjpsjpkernels/vector/bak/rhs/0000755000175600017620000000000011553016261013537 5ustar sjpsjpkernels/vector/.svn/props/0000755000175600017620000000000011553617365014251 5ustar 
sjpsjpkernels/vector/.svn/prop-base/0000755000175600017620000000000011553617365014776 5ustar sjpsjpkernels/vector/.svn/tmp/0000755000175600017620000000000011753220711013671 5ustar sjpsjpkernels/vector/.svn/text-base/0000755000175600017620000000000011570740336014774 5ustar sjpsjpkernels/vector/rhs/xi2/0000755000175600017620000000000011553620066013510 5ustar sjpsjpkernels/vector/rhs/xi/0000755000175600017620000000000011553620276013431 5ustar sjpsjpkernels/vector/rhs/zeta2/0000755000175600017620000000000011553620112014023 5ustar sjpsjpkernels/vector/rhs/eta2/0000755000175600017620000000000011553620074013640 5ustar sjpsjpkernels/vector/rhs/zeta/0000755000175600017620000000000011553620277013755 5ustar sjpsjpkernels/vector/rhs/eta/0000755000175600017620000000000011553620300013546 5ustar sjpsjpkernels/vector/rhs/.svn/0000755000175600017620000000000011753220711013665 5ustar sjpsjpkernels/scalar/rhs/zeta/0000755000175600017620000000000011553326153013714 5ustar sjpsjpkernels/scalar/rhs/xi/0000755000175600017620000000000011570737742013402 5ustar sjpsjpkernels/scalar/rhs/eta/0000755000175600017620000000000011553326301013515 5ustar sjpsjpkernels/scalar/rhs/.svn/0000755000175600017620000000000011753220711013630 5ustar sjpsjpkernels/scalar/.svn/tmp/0000755000175600017620000000000011753220711013634 5ustar sjpsjpkernels/scalar/.svn/text-base/0000755000175600017620000000000011570740336014737 5ustar sjpsjpkernels/scalar/.svn/props/0000755000175600017620000000000011542404474014205 5ustar sjpsjpkernels/scalar/.svn/prop-base/0000755000175600017620000000000011542404474014732 5ustar sjpsjpkernels/vector2/.svn/tmp/text-base/0000755000175600017620000000000011545060264015653 5ustar sjpsjpkernels/vector2/.svn/tmp/props/0000755000175600017620000000000011542404476015126 5ustar sjpsjpkernels/vector2/.svn/tmp/prop-base/0000755000175600017620000000000011542404476015653 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/0000755000175600017620000000000011551607762014725 5ustar 
sjpsjpkernels/vector2/rhs/xi/.svn/0000755000175600017620000000000011551607762014402 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/0000755000175600017620000000000011551607762014533 5ustar sjpsjpkernels/vector2/rhs/.svn/tmp/0000755000175600017620000000000011551607761014561 5ustar sjpsjpkernels/vector2/rhs/.svn/text-base/0000755000175600017620000000000011546346006015651 5ustar sjpsjpkernels/vector2/rhs/.svn/props/0000755000175600017620000000000011542404476015122 5ustar sjpsjpkernels/vector2/rhs/.svn/prop-base/0000755000175600017620000000000011542404476015647 5ustar sjpsjpkernels/vector/bak/rhs/eta/0000755000175600017620000000000011553015574014316 5ustar sjpsjpkernels/vector/bak/rhs/.svn/0000755000175600017620000000000011551607762014435 5ustar sjpsjpkernels/vector/bak/rhs/zeta/0000755000175600017620000000000011546346005014506 5ustar sjpsjpkernels/vector/bak/rhs/xi/0000755000175600017620000000000011553015412014154 5ustar sjpsjpkernels/vector/.svn/tmp/text-base/0000755000175600017620000000000011553617675015606 5ustar sjpsjpkernels/vector/.svn/tmp/props/0000755000175600017620000000000011553617364015050 5ustar sjpsjpkernels/vector/.svn/tmp/prop-base/0000755000175600017620000000000011553617364015575 5ustar sjpsjpkernels/vector/rhs/xi/.svn/0000755000175600017620000000000011753220711014305 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/0000755000175600017620000000000011753220711014630 5ustar sjpsjpkernels/vector/rhs/eta/.svn/0000755000175600017620000000000011753220711014436 5ustar sjpsjpkernels/vector/rhs/.svn/tmp/0000755000175600017620000000000011753220711014465 5ustar sjpsjpkernels/vector/rhs/.svn/text-base/0000755000175600017620000000000011553620402015560 5ustar sjpsjpkernels/vector/rhs/.svn/props/0000755000175600017620000000000011553617407015042 5ustar sjpsjpkernels/vector/rhs/.svn/prop-base/0000755000175600017620000000000011553617407015567 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/0000755000175600017620000000000011753220711014573 5ustar 
sjpsjpkernels/scalar/rhs/xi/.svn/0000755000175600017620000000000011753220711014250 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/0000755000175600017620000000000011753220711014401 5ustar sjpsjpkernels/scalar/rhs/.svn/tmp/0000755000175600017620000000000011753220711014430 5ustar sjpsjpkernels/scalar/rhs/.svn/text-base/0000755000175600017620000000000011553617675015545 5ustar sjpsjpkernels/scalar/rhs/.svn/props/0000755000175600017620000000000011542404475015002 5ustar sjpsjpkernels/scalar/rhs/.svn/prop-base/0000755000175600017620000000000011542404475015527 5ustar sjpsjpkernels/scalar/.svn/tmp/text-base/0000755000175600017620000000000011545060264015534 5ustar sjpsjpkernels/scalar/.svn/tmp/props/0000755000175600017620000000000011542404474015005 5ustar sjpsjpkernels/scalar/.svn/tmp/prop-base/0000755000175600017620000000000011542404474015532 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/tmp/0000755000175600017620000000000011551607761015524 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/text-base/0000755000175600017620000000000011546346007016615 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/props/0000755000175600017620000000000011542404477016066 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/prop-base/0000755000175600017620000000000011542404477016613 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/tmp/0000755000175600017620000000000011551607760015200 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/text-base/0000755000175600017620000000000011546346007016272 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/props/0000755000175600017620000000000011542404476015542 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/prop-base/0000755000175600017620000000000011542404476016267 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/tmp/0000755000175600017620000000000011551607761015332 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/text-base/0000755000175600017620000000000011546346006016422 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/props/0000755000175600017620000000000011542404477015674 5ustar 
sjpsjpkernels/vector2/rhs/eta/.svn/prop-base/0000755000175600017620000000000011542404477016421 5ustar sjpsjpkernels/vector2/rhs/.svn/tmp/text-base/0000755000175600017620000000000011545060264016447 5ustar sjpsjpkernels/vector2/rhs/.svn/tmp/props/0000755000175600017620000000000011542404476015722 5ustar sjpsjpkernels/vector2/rhs/.svn/tmp/prop-base/0000755000175600017620000000000011542404476016447 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/0000755000175600017620000000000011551607762015206 5ustar sjpsjpkernels/vector/bak/rhs/.svn/props/0000755000175600017620000000000011542404476015575 5ustar sjpsjpkernels/vector/bak/rhs/.svn/prop-base/0000755000175600017620000000000011542404476016322 5ustar sjpsjpkernels/vector/bak/rhs/.svn/tmp/0000755000175600017620000000000011551607761015234 5ustar sjpsjpkernels/vector/bak/rhs/.svn/text-base/0000755000175600017620000000000011546346006016324 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/0000755000175600017620000000000011551607762015400 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/0000755000175600017620000000000011551607762015055 5ustar sjpsjpkernels/vector/rhs/xi/.svn/tmp/0000755000175600017620000000000011753220710015104 5ustar sjpsjpkernels/vector/rhs/xi/.svn/text-base/0000755000175600017620000000000011553620402016200 5ustar sjpsjpkernels/vector/rhs/xi/.svn/props/0000755000175600017620000000000011553620275015457 5ustar sjpsjpkernels/vector/rhs/xi/.svn/prop-base/0000755000175600017620000000000011553620275016204 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/tmp/0000755000175600017620000000000011753220711015430 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/text-base/0000755000175600017620000000000011553620277016535 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/props/0000755000175600017620000000000011553620276016003 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/prop-base/0000755000175600017620000000000011553620276016530 5ustar sjpsjpkernels/vector/rhs/eta/.svn/tmp/0000755000175600017620000000000011753220711015236 5ustar 
sjpsjpkernels/vector/rhs/eta/.svn/text-base/0000755000175600017620000000000011553620401016330 5ustar sjpsjpkernels/vector/rhs/eta/.svn/props/0000755000175600017620000000000011553620277015612 5ustar sjpsjpkernels/vector/rhs/eta/.svn/prop-base/0000755000175600017620000000000011553620277016337 5ustar sjpsjpkernels/vector/rhs/.svn/tmp/text-base/0000755000175600017620000000000011553620402016360 5ustar sjpsjpkernels/vector/rhs/.svn/tmp/props/0000755000175600017620000000000011553617407015642 5ustar sjpsjpkernels/vector/rhs/.svn/tmp/prop-base/0000755000175600017620000000000011553617407016367 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/tmp/0000755000175600017620000000000011753220711015373 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/text-base/0000755000175600017620000000000011546346007016476 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/props/0000755000175600017620000000000011542404476015746 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/prop-base/0000755000175600017620000000000011542404475016472 5ustar sjpsjpkernels/scalar/rhs/xi/.svn/tmp/0000755000175600017620000000000011753220711015050 5ustar sjpsjpkernels/scalar/rhs/xi/.svn/text-base/0000755000175600017620000000000011546346007016153 5ustar sjpsjpkernels/scalar/rhs/xi/.svn/props/0000755000175600017620000000000011542404475015422 5ustar sjpsjpkernels/scalar/rhs/xi/.svn/prop-base/0000755000175600017620000000000011542404475016147 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/tmp/0000755000175600017620000000000011753220711015201 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/text-base/0000755000175600017620000000000011546346007016304 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/props/0000755000175600017620000000000011542404475015553 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/prop-base/0000755000175600017620000000000011542404475016300 5ustar sjpsjpkernels/scalar/rhs/.svn/tmp/text-base/0000755000175600017620000000000011553617675016345 5ustar sjpsjpkernels/scalar/rhs/.svn/tmp/props/0000755000175600017620000000000011542404475015602 5ustar 
sjpsjpkernels/scalar/rhs/.svn/tmp/prop-base/0000755000175600017620000000000011542404475016327 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/tmp/text-base/0000755000175600017620000000000011542404570017411 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/tmp/props/0000755000175600017620000000000011542404477016666 5ustar sjpsjpkernels/vector2/rhs/zeta/.svn/tmp/prop-base/0000755000175600017620000000000011542404477017413 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/tmp/text-base/0000755000175600017620000000000011545060264017067 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/tmp/props/0000755000175600017620000000000011542404477016343 5ustar sjpsjpkernels/vector2/rhs/xi/.svn/tmp/prop-base/0000755000175600017620000000000011542404477017070 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/tmp/text-base/0000755000175600017620000000000011545060264017220 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/tmp/props/0000755000175600017620000000000011542404477016474 5ustar sjpsjpkernels/vector2/rhs/eta/.svn/tmp/prop-base/0000755000175600017620000000000011542404477017221 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/props/0000755000175600017620000000000011542404477016347 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/prop-base/0000755000175600017620000000000011542404477017074 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/tmp/0000755000175600017620000000000011551607761016005 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/text-base/0000755000175600017620000000000011546346006017075 5ustar sjpsjpkernels/vector/bak/rhs/.svn/tmp/text-base/0000755000175600017620000000000011545060264017122 5ustar sjpsjpkernels/vector/bak/rhs/.svn/tmp/props/0000755000175600017620000000000011542404476016375 5ustar sjpsjpkernels/vector/bak/rhs/.svn/tmp/prop-base/0000755000175600017620000000000011542404476017122 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/props/0000755000175600017620000000000011542404477016541 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/prop-base/0000755000175600017620000000000011542404477017266 5ustar 
sjpsjpkernels/vector/bak/rhs/zeta/.svn/tmp/0000755000175600017620000000000011551607761016177 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/text-base/0000755000175600017620000000000011546346007017270 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/props/0000755000175600017620000000000011542404476016215 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/prop-base/0000755000175600017620000000000011542404476016742 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/tmp/0000755000175600017620000000000011551607760015653 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/text-base/0000755000175600017620000000000011546346007016745 5ustar sjpsjpkernels/vector/rhs/xi/.svn/tmp/text-base/0000755000175600017620000000000011553620402017000 5ustar sjpsjpkernels/vector/rhs/xi/.svn/tmp/props/0000755000175600017620000000000011553620275016257 5ustar sjpsjpkernels/vector/rhs/xi/.svn/tmp/prop-base/0000755000175600017620000000000011553620275017004 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/tmp/text-base/0000755000175600017620000000000011553620277017335 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/tmp/props/0000755000175600017620000000000011553620276016603 5ustar sjpsjpkernels/vector/rhs/zeta/.svn/tmp/prop-base/0000755000175600017620000000000011553620276017330 5ustar sjpsjpkernels/vector/rhs/eta/.svn/tmp/text-base/0000755000175600017620000000000011553620401017130 5ustar sjpsjpkernels/vector/rhs/eta/.svn/tmp/props/0000755000175600017620000000000011553620277016412 5ustar sjpsjpkernels/vector/rhs/eta/.svn/tmp/prop-base/0000755000175600017620000000000011553620277017137 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/tmp/text-base/0000755000175600017620000000000011542404571017273 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/tmp/props/0000755000175600017620000000000011542404476016546 5ustar sjpsjpkernels/scalar/rhs/zeta/.svn/tmp/prop-base/0000755000175600017620000000000011542404476017273 5ustar sjpsjpkernels/scalar/rhs/xi/.svn/tmp/text-base/0000755000175600017620000000000011542404570016747 5ustar 
sjpsjpkernels/scalar/rhs/xi/.svn/tmp/props/0000755000175600017620000000000011542404475016222 5ustar sjpsjpkernels/scalar/rhs/xi/.svn/tmp/prop-base/0000755000175600017620000000000011542404475016747 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/tmp/text-base/0000755000175600017620000000000011542404567017106 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/tmp/props/0000755000175600017620000000000011542404475016353 5ustar sjpsjpkernels/scalar/rhs/eta/.svn/tmp/prop-base/0000755000175600017620000000000011542404475017100 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/tmp/text-base/0000755000175600017620000000000011545060264017673 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/tmp/props/0000755000175600017620000000000011542404477017147 5ustar sjpsjpkernels/vector/bak/rhs/eta/.svn/tmp/prop-base/0000755000175600017620000000000011542404477017674 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/tmp/text-base/0000755000175600017620000000000011542404570020064 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/tmp/props/0000755000175600017620000000000011542404477017341 5ustar sjpsjpkernels/vector/bak/rhs/zeta/.svn/tmp/prop-base/0000755000175600017620000000000011542404477020066 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/tmp/text-base/0000755000175600017620000000000011545060264017542 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/tmp/props/0000755000175600017620000000000011542404477017016 5ustar sjpsjpkernels/vector/bak/rhs/xi/.svn/tmp/prop-base/0000755000175600017620000000000011542404477017543 5ustar sjpsjpheaders/wcl.h0000644000175600017620000000174011535403076011620 0ustar sjpsjp#ifndef __WCL_H__ #define __WCL_H__ #include // Global variables. extern cl_device_id wclDevice; extern cl_context wclContext; extern cl_command_queue wclCommands; extern cl_platform_id wclPlatform; extern cl_program wclProgram; // Library initialisation/cleanup routines. // TODO: Add support for multiple devices/command queues. 
extern void wclInit(cl_device_type device_type);/*, int kernels, int buffers);*/ extern void wclGetPlatform(const char* platform_name); extern void wclCleanup(); // Kernel creation routines. extern char* wclLoadProgramSource(const char* filename); /*extern int wclRegisterKernel(cl_kernel kernel); // Buffer creation routines. extern int wclRegisterBuffer(cl_mem buffer);*/ // Error handling routines. extern char* wclErrorString(cl_int status); extern int wclCheckStatus(cl_int status, cl_int desired, const char* message); extern void wclCheckError(cl_int status, cl_int desired, const char* message); #endif headers/util.h0000644000175600017620000000042211535457500012005 0ustar sjpsjp#ifndef __UTIL_H__ #define __UTIL_H__ #include "applu.h" #include "timers.h" #define min(x,y) (x < y ? x : y) #define max(x,y) (x > y ? x : y) extern void parse_options(int argc, char* argv[]); extern void allocate_buffers(); extern void free_buffers(); #endif headers/timers.h0000644000175600017620000000140711551615652012341 0ustar sjpsjp#ifndef __TIMERS_H__ #define __TIMERS_H__ /** * Timer struct. */ typedef struct { double cpu_start; double cpu_end; double wall_start; double wall_end; double cpu_total; double wall_total; int calls; } Timer; extern void timer(double* cpu, double* et); extern void timer_init(int number); extern void timer_reset(); extern void timer_start(int i); extern void timer_stop(int i); extern void timer_pause(int i); extern void timer_restart(int i); extern void timer_finalize(); extern double timer_cpu_total(int i); extern double timer_cpu_avg(int i); extern double timer_wall_total(int i); extern double timer_wall_avg(int i); extern void timer_print_format(); extern void timer_print(const char* name, int i); #endif headers/size.h0000644000175600017620000000067011573566600012013 0ustar sjpsjp/** * Define the problem and sub-problem sizes. */ // Number of processors compiled for. #define nnodes_compiled 1 // Full array size. 
#define isiz01 162 #define isiz02 162 #define isiz03 162 // Array size per processor. #define isiz1 162 #define isiz2 162 #define isiz3 isiz03 // Number of iterations, etc. #define itmax_default 250 #define inorm_default 250 #define dt_default 2.0e+00 #define kblock 162 #define problem_height 162 headers/mpinpb.h0000755000175600017620000000021211535242046012312 0ustar sjpsjp/** * Shared header file for anything using MPI. */ #include extern int node, no_nodes, root, comm_setup, comm_solve, comm_rhs; headers/globals.h0000644000175600017620000000236311561525451012461 0ustar sjpsjp/** * Definition of global variables defined by applu.h * * Note: This file should ONLY be included by main.c. * "#include applu.h" is sufficient to access these variables. */ int verified; double mflops; char class; int nx, ny, nz; int nx0, ny0, nz0; int ipt, ist, iend; int jpt, jst, jend; int ii1, ii2; int ji1, ji2; int ki1, ki2; double dxi, deta, dzeta; double tx1, tx2, tx3; double ty1, ty2, ty3; double tz1, tz2, tz3; double dx1, dx2, dx3, dx4, dx5; double dy1, dy2, dy3, dy4, dy5; double dz1, dz2, dz3, dz4, dz5; double dssp; double *u, *rsd, *frct, *flux; int ipr, inorm; int itmax, invert; double dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc, ttotal; double ce[13][5]; int id, ndim, num, xdim, ydim, row, col; int north, south, east, west; int icommn[npmax+1], icomms[npmax+1], icomme[npmax+1], icommw[npmax+1]; double *buf, *buf1; double *ibuf, *jbuf; double walltime, cputime; // Some MPI stuff. int root; int rhsblock[3]; int rhsgrid[3]; int waveblock[3]; int wavegrid[3]; int ex1iblock[3]; int ex1igrid[3]; int ex1jblock[3]; int ex1jgrid[3]; int ex3iblock[3]; int ex3igrid[3]; int ex3jblock[3]; int ex3jgrid[3]; int opt_platform, opt_device, opt_distribution, opt_kernels, opt_layout, opt_blocking, opt_fission; size_t max_buffer_size; headers/applu_cl.h0000644000175600017620000001053211553632273012634 0ustar sjpsjp/** * Header for useful OpenCL utility functions. 
*/ #ifndef __APPLU_CL_H__ #define __APPLU_CL_H__ #include #include "wcl.h" #include #include #include // Definition of the OpenCL kernels. extern cl_kernel blts_kernel; extern cl_kernel buts_kernel; extern cl_kernel l2norm_kernel; extern cl_kernel pre_kernel; extern cl_kernel post_kernel; extern cl_kernel rhs_setup_kernel; extern cl_kernel rhs_xi_kernel; extern cl_kernel rhs_eta_kernel; extern cl_kernel rhs_zeta_kernel; extern cl_kernel rhs_xi1_kernel; extern cl_kernel rhs_xi2_kernel; extern cl_kernel rhs_xi3_kernel; extern cl_kernel rhs_xi4_kernel; extern cl_kernel rhs_xi_dissipation_kernel; extern cl_kernel rhs_eta1_kernel; extern cl_kernel rhs_eta2_kernel; extern cl_kernel rhs_eta3_kernel; extern cl_kernel rhs_eta4_kernel; extern cl_kernel rhs_eta_dissipation_kernel; extern cl_kernel rhs_zeta1_kernel; extern cl_kernel rhs_zeta2_kernel; extern cl_kernel rhs_zeta3_kernel; extern cl_kernel rhs_zeta4_kernel; extern cl_kernel rhs_zeta_dissipation_kernel; extern cl_kernel flat_to_hyperplane_kernel; extern cl_kernel hyperplane_to_flat_kernel; extern cl_kernel flat_to_tiled_kernel; extern cl_kernel tiled_to_flat_kernel; extern cl_kernel tiled_to_hyperplane_kernel; extern cl_kernel hyperplane_to_tiled_kernel; extern cl_kernel ex1_unpack_north_kernel; extern cl_kernel ex1_unpack_west_kernel; extern cl_kernel ex1_unpack_south_kernel; extern cl_kernel ex1_unpack_east_kernel; extern cl_kernel ex1_pack_north_kernel; extern cl_kernel ex1_pack_west_kernel; extern cl_kernel ex1_pack_south_kernel; extern cl_kernel ex1_pack_east_kernel; extern cl_kernel ex3_unpack_north_kernel; extern cl_kernel ex3_unpack_west_kernel; extern cl_kernel ex3_unpack_south_kernel; extern cl_kernel ex3_unpack_east_kernel; extern cl_kernel ex3_pack_north_kernel; extern cl_kernel ex3_pack_west_kernel; extern cl_kernel ex3_pack_south_kernel; extern cl_kernel ex3_pack_east_kernel; extern cl_kernel memset_double_kernel; extern cl_kernel print_mem_kernel; // Pointers to the memory. 
extern cl_mem u_d; extern cl_mem rsd_d; extern cl_mem frct_d; extern cl_mem flux_d; extern cl_mem sum_d; extern cl_mem wavefront_offsets_2d_d; extern cl_mem wavefront_offsets_3d_d; extern cl_mem columns_d; extern cl_mem rows_d; extern cl_mem thread_mapping_d; extern cl_mem rearrangement_d; extern cl_mem ibuffer_d; extern cl_mem jbuffer_d; extern cl_mem buf_d; extern cl_mem buf1_d; // Some convenience variables for fission. extern cl_device_id subDevice; extern cl_context subContext; extern cl_command_queue subQueue; // Prototype functions. extern void swap_pointers(cl_mem* x, cl_mem* y); extern void ssor_cl(int niter); extern void blts_cl(int starting_k, cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_lengths_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping); extern void buts_cl(int starting_k, cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_lengths_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping); extern void blts_cl_new(cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_lengths_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping); extern void buts_cl_new(cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_lengths_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping); extern void l2norm_cl(int nz0, double* sum_h, cl_mem rsd, cl_mem sum_d); extern void post_cl(double tmp, cl_mem u, cl_mem rsd); extern void pre_cl(cl_mem rsd); extern void rhs_cl(cl_mem u, cl_mem rsd, cl_mem frct, cl_mem flux); extern void build_kernels(); extern void free_kernels(); extern void allocate_cl_buffers(); extern void free_cl_buffers(); extern void prepare_lookup_tables(int* wavefront_offsets_2d, int* wavefront_offsets_3d, int* columns, int* rows, int* thread_mapping); extern void flat_to_hyperplane(cl_mem flat, cl_mem hyperplane); extern void hyperplane_to_flat(cl_mem hyperplane, cl_mem flat); extern void tiled_to_hyperplane(cl_mem tiled, cl_mem hyperplane); extern void hyperplane_to_tiled(cl_mem 
hyperplane, cl_mem tiled); extern void flat_to_tiled(cl_mem flat, cl_mem tiled); extern void tiled_to_flat(cl_mem tiled, cl_mem flat); extern void exchange_1_cl(int k, int iex); extern void exchange_3_cl(int iex); #endif headers/applu.h0000755000175600017620000001047611561531130012155 0ustar sjpsjp#ifndef __APPLU_H__ #define __APPLU_H__ // size.h defines problem and decomposition sizes. #include "size.h" #include /** * isiz01,02,03 give maximum size. * ipr = 1 to print out verbose information. * omega = 2.0 is correct for all classes. * tolrsd is tolerance levels for steady state residuals. */ #define ipr_default 1 #define omega_default (1.20) #define tolrsd1_def (1.0e-08) #define tolrsd2_def (1.0e-08) #define tolrsd3_def (1.0e-08) #define tolrsd4_def (1.0e-08) #define tolrsd5_def (1.0e-08) #define c1 (1.40e+00) #define c2 (0.40e+00) #define c3 (1.00e-01) #define c4 (1.00e+00) #define c5 (1.40e+00) /** * Grid. */ extern int nx, ny, nz; extern int nx0, ny0, nz0; extern int ipt, ist, iend; extern int jpt, jst, jend; extern int ii1, ii2; extern int ji1, ji2; extern int ki1, ki2; extern double dxi, deta, dzeta; extern double tx1, tx2, tx3; extern double ty1, ty2, ty3; extern double tz1, tz2, tz3; /** * Dissipation. */ extern double dx1, dx2, dx3, dx4, dx5; extern double dy1, dy2, dy3, dy4, dy5; extern double dz1, dz2, dz3, dz4, dz5; extern double dssp; /** * Field variables and residuals. */ extern double* u; extern double* rsd; extern double* frct; extern double* flux; #define u(k,j,i,m) (u[(((k) * (isiz2 + 4) + (j)) * (isiz1 + 4) + (i)) * 5 + (m)]) #define rsd(k,j,i,m) (rsd[(((k) * (isiz2 + 4) + (j)) * (isiz1 + 4) + (i)) * 5 + (m)]) #define frct(k,j,i,m) (frct[(((k) * (isiz2 + 4) + (j)) * (isiz1 + 4) + (i)) * 5 + (m)]) #define flux(k,j,i,m) (flux[(((k) * (isiz2 + 4) + (j)) * (isiz1 + 4) + (i)) * 5 + (m)]) /** * Output control parameters. */ extern int ipr, inorm; /** * Newton-raphson iteration control parameters. 
*/ extern int itmax, invert; extern double dt, omega, tolrsd[5], rsdnm[5], errnm[5], frc, ttotal; /** * Coefficients of the exact solution. */ extern double ce[13][5]; /** * Multi-processor common blocks. */ extern int id, ndim, num, xdim, ydim, row, col; extern int north, south, east, west; #define from_s 1 #define from_n 2 #define from_e 3 #define from_w 4 #define npmax (isiz01 + isiz02) extern int icommn[npmax+1], icomms[npmax+1], icomme[npmax+1], icommw[npmax+1]; extern double *buf, *buf1; #define buf(j,i) (buf[(j)*5 + (i)]) #define buf1(j,i) (buf1[(j)*5 + (i)]) extern double *jbuf, *ibuf; extern double walltime, cputime; extern size_t max_buffer_size; /** * Some variables for controlling the sizes of the workgroups. */ extern int rhsblock[3]; extern int rhsgrid[3]; extern int waveblock[3]; extern int wavegrid[3]; extern int ex1iblock[3]; extern int ex1igrid[3]; extern int ex1jblock[3]; extern int ex1jgrid[3]; extern int ex3iblock[3]; extern int ex3igrid[3]; extern int ex3jblock[3]; extern int ex3jgrid[3]; /** * Some variables for controlling CPU/OpenCL execution. */ #define APPLU_DEVICE_CPU 0 #define APPLU_DEVICE_GPU 1 extern int opt_device; #define APPLU_DISTRIBUTION_FINE 0 #define APPLU_DISTRIBUTION_COARSE 1 extern int opt_distribution; #define APPLU_KERNELS_SCALAR 0 #define APPLU_KERNELS_VECTOR 1 extern int opt_kernels; #define APPLU_PLATFORM_AMD 0 #define APPLU_PLATFORM_NVIDIA 1 #define APPLU_PLATFORM_INTEL 2 extern int opt_platform; #define APPLU_LAYOUT_AOS 0 #define APPLU_LAYOUT_SOA 1 extern int opt_layout; #define APPLU_BLOCKING_OLD 0 #define APPLU_BLOCKING_NEW 1 extern int opt_blocking; #define APPLU_FISSION_OFF 0 #define APPLU_FISSION_ON 1 extern int opt_fission; // Function prototypes. 
extern void bcast_inputs(); extern void erhs(); extern void error(); extern void exact(int i, int j, int k, double u000ijk[5]); extern void exchange_1(double* g, int k, int iex); extern void exchange_3(double* g, int iex); extern void exchange_4(double** g, double** h, int ibeg, int ifin1, int jbeg, int jfin1); extern void exchange_5(double** g, int ibeg, int ifin1); extern void exchange_6(double** g, int jbeg, int jfin1); extern void init_comm(int argc, char** argv); extern void read_input(); extern void neighbors(); extern int nodedim(int num); extern void pintgr(); extern void print_timers(); extern void proc_grid(); extern void setbv(); extern void setcoeff(); extern void setiv(); extern void sethyper(); extern void subdomain(); extern int verify(double xcr[5], double xce[5], double xci, char class); #endif kernels/soa.clh0000644000175600017620000000544711545105447012176 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { #ifdef APPLU_BLOCKING_OLD int offset = 0; // Calculate thread id. offset += thread_mapping[(j * (isiz1 + 4)) + i]; // Jump enough blocks. int block_depth = k / kblock; int depth = k - (kblock * block_depth); offset += block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock); // Jump to the right wavefront. offset += wave_offset_3d[i + j + depth]; // Update thread_offset. if ( (i + j + depth) >= kblock - 1 ) { offset = offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. 
offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4)); return offset; #else //#ifdef APPLU_BLOCKING_NEW int offset = 0; // Calculate thread id. offset += thread_mapping[(j * (isiz1 + 4)) + i]; // Jump to the right wavefront. offset += wave_offset_3d[i + j + k]; // Update thread_offset. if ( (i + j + k) >= isiz3 - 1 ) { offset = offset - wave_offset_2d[(i + j + k) - (isiz3 - 1)]; } // Add angle offset. offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4)); return offset; #endif } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { // Note: Not using the tiled index is beneficial for a few reasons: // 1) Accesses are still coalesced using simple SoA. // 2) 3D grid appears to schedule blocks differently, so there is less reason to try and avoid partition camping. return m * (isiz1+4) * (isiz2+4) * isiz3 + (k * (isiz2 + 4) + j) * (isiz1 + 4) + i; /*int offset = 0; // Add block offset. const int block_i = (i / rhsblock_x); const int block_j = (j / rhsblock_y); const int block_id = (block_j * rhsgrid_x) + block_i; offset += block_id * (rhsblock_x * rhsblock_y * isiz3); // Add thread offset. const int thread_i = i - (block_i * rhsblock_x); const int thread_j = j - (block_j * rhsblock_y); offset += (thread_j * rhsblock_x) + thread_i; // Add k offset. offset += k * (rhsblock_x * rhsblock_y); // Add angle offset. offset += m * (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3); // Return. return offset;*/ } // Macro definitions for blts and buts. #define m_offset (problem_height * (isiz2 + 4) * (isiz1 + 4)) // Macro definitions for vector kernels. #define vint int2 #define vdouble double2 #define vlong long2 #define vlength 2 #define vload vload2 #define vstore vstore2 kernels/nvidia.clh0000644000175600017620000000034011536374341012652 0ustar sjpsjp// Pragma required to enable double precision. 
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #define c1_def (1.40e+00) #define c2_def (0.40e+00) #define c3_def (1.00e-01) #define c4_def (1.00e+00) #define c5_def (1.40e+00) kernels/intel.clh0000644000175600017620000000034211561537724012521 0ustar sjpsjp// Pragma required to enable double precision. //#pragma OPENCL EXTENSION cl_khr_fp64 : enable #define c1_def (1.40e+00) #define c2_def (0.40e+00) #define c3_def (1.00e-01) #define c4_def (1.00e+00) #define c5_def (1.40e+00) kernels/aos.clh0000644000175600017620000000310511553323533012160 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { #ifdef APPLU_BLOCKING_OLD // Calculate block offset. int block_depth = k / kblock; int depth = k - (kblock * block_depth); int block_offset = block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock * 5); // Calculate thread offset. int thread_offset = thread_mapping[(j * (isiz1 + 4)) + i]; thread_offset += wave_offset_3d[i + j + depth]; if ( (i + j + depth) >= kblock - 1 ) { thread_offset = thread_offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. return block_offset + (5 * thread_offset) + m; #else //#ifdef APPLU_BLOCKING_NEW // Calculate thread offset. int thread_offset = thread_mapping[(j * (isiz1 + 4)) + i]; thread_offset += wave_offset_3d[i + j + k]; if ( (i + j + k) >= isiz3 - 1 ) { thread_offset = thread_offset - wave_offset_2d[(i + j + k) - (isiz3 - 1)]; } // Add angle offset. return (5 * thread_offset) + m; #endif } /** * Calculate the tiled index for ursd. 
*/ inline int tiled_index(const int k, const int j, const int i, const int m) { return flat_index(k, j, i, m); } // Macro definitions for blts and buts. #define m_offset (1) kernels/amd.clh0000644000175600017620000000050011553043445012134 0ustar sjpsjp// Pragma required to enable double precision. #pragma OPENCL EXTENSION cl_amd_fp64 : enable //#pragma OPENCL EXTENSION cl_khr_fp64 : enable #pragma OPENCL EXTENSION cl_amd_printf : enable #define c1_def (1.40e+00) #define c2_def (0.40e+00) #define c3_def (1.00e-01) #define c4_def (1.00e+00) #define c5_def (1.40e+00) kernels/vector2/rearrangement.cl0000644000175600017620000001761711544365677015500 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. */ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int f_index = flat_index(k, j, i, 0); hyperplane_output[h_index + 0 * m_offset] = flat_input[f_index + 0]; hyperplane_output[h_index + 1 * m_offset] = flat_input[f_index + 1]; hyperplane_output[h_index + 2 * m_offset] = flat_input[f_index + 2]; hyperplane_output[h_index + 3 * m_offset] = flat_input[f_index + 3]; hyperplane_output[h_index + 4 * m_offset] = flat_input[f_index + 4]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int f_index = flat_index(k, j, i, 0); flat_output[f_index + 0] = hyperplane_input[h_index + 0 * m_offset]; flat_output[f_index + 1] = hyperplane_input[h_index + 1 * m_offset]; flat_output[f_index + 2] = hyperplane_input[h_index + 2 * m_offset]; flat_output[f_index + 3] = hyperplane_input[h_index + 3 * m_offset]; flat_output[f_index + 4] = hyperplane_input[h_index + 4 * m_offset]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int f_index = flat_index(k, j, i, 0); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); tiled_output[t_index + 0 * t_offset] = flat_input[f_index + 0]; tiled_output[t_index + 1 * t_offset] = flat_input[f_index + 1]; tiled_output[t_index + 2 * t_offset] = flat_input[f_index + 2]; tiled_output[t_index + 3 * t_offset] = flat_input[f_index + 3]; tiled_output[t_index + 4 * t_offset] = flat_input[f_index + 4]; } } } } /** * Shift from tiled to flat layout. 
*/ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int f_index = flat_index(k, j, i, 0); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flat_output[f_index + 0] = tiled_input[t_index + 0 * t_offset]; flat_output[f_index + 1] = tiled_input[t_index + 1 * t_offset]; flat_output[f_index + 2] = tiled_input[t_index + 2 * t_offset]; flat_output[f_index + 3] = tiled_input[t_index + 3 * t_offset]; flat_output[f_index + 4] = tiled_input[t_index + 4 * t_offset]; } } } } /** * Shift from tiled to hyperplane layout. */ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); hyperplane_output[h_index + 0 * m_offset] = tiled_input[t_index + 0 * t_offset]; hyperplane_output[h_index + 1 * m_offset] = tiled_input[t_index + 1 * t_offset]; hyperplane_output[h_index + 2 * m_offset] = tiled_input[t_index + 2 * t_offset]; hyperplane_output[h_index + 3 * m_offset] = tiled_input[t_index + 3 * t_offset]; hyperplane_output[h_index + 4 * m_offset] = tiled_input[t_index + 4 * t_offset]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); tiled_output[t_index + 0 * t_offset] = hyperplane_input[h_index + 0 * m_offset]; tiled_output[t_index + 1 * t_offset] = hyperplane_input[h_index + 1 * m_offset]; tiled_output[t_index + 2 * t_offset] = hyperplane_input[h_index + 2 * m_offset]; tiled_output[t_index + 3 * t_offset] = hyperplane_input[h_index + 3 * m_offset]; tiled_output[t_index + 4 * t_offset] = hyperplane_input[h_index + 4 * m_offset]; } } } } kernels/vector2/print.cl0000644000175600017620000000061411544123621013744 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/vector2/pre.cl0000644000175600017620000000317311553015436013405 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble dt_v = (vdouble) dt; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; vdouble res = vload(0, rsd + t_index + 0 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 0 * t_offset); res = vload(0, rsd + t_index + 1 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 1 * t_offset); res = vload(0, rsd + t_index + 2 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 2 * t_offset); res = vload(0, rsd + t_index + 3 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 3 * t_offset); res = vload(0, rsd + t_index + 4 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 4 * t_offset); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } kernels/vector2/post.cl0000644000175600017620000000364011553015427013603 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble tmp_v = (vdouble) tmp; int index; index = tiled_index(k, j, i, 0); vdouble u_v = vload(0, u + index); vdouble r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 1); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 2); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 3); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 4); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); } for (; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } kernels/vector2/l2norm.cl0000644000175600017620000000072311544123621014022 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. 
int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/vector2/ex3_unpack.cl0000644000175600017620000001044411544123621014652 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/vector2/ex3_pack.cl0000644000175600017620000001031111544123621014300 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/vector2/ex1_unpack.cl0000644000175600017620000001043111544123620014643 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/vector2/ex1_pack.cl0000644000175600017620000001040711544123620014303 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/vector2/buts.cl0000644000175600017620000010135711553015505013574 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) ))
#define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 ))
#define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 ))
#define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 ))
#define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 ))

// Serial tidy-up function.
// Scalar fall-back for ONE wavefront cell of the upper-triangular (buts)
// sweep: accumulates the omega-weighted couplings from the (k+1), (j+1) and
// (i+1) neighbours via the jacu_c / jacu_b / jacu_a macros, then solves the
// 5x5 diagonal block (jacu_d) by Gaussian elimination without pivoting plus
// back substitution, and subtracts the solution from v in place.
// The jacu_* macros read u0..u4, tmp1..tmp3, r43, c34, c1345 from the
// enclosing scope, which is why those locals are (re)assigned before each use.
void buts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) {
    // Recover (i, j, k) and this cell's depth within the current wavefront.
    const int i = columns[cell];
    const int j = rows[cell];
    const int k = (starting_k - (kblock -1)) + (wavefront - (i + j));
    const int depth = (wavefront - (i + j));
    const double r43 = ( 4.0e+00 / 3.0e+00 );
    const double c1345 = c1_def * c3_def * c4_def * c5_def;
    const double c34 = c3_def * c4_def;
    // Only update interior cells whose depth lies inside the current k-block.
    if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) {
        int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
        // Update the values of v based on the cell's neighbour in the k direction.
        // Calculate the index for (k+1, j, i).
        int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
        // Read in v neighbour.
        double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset];
        // Read in u neighbour, for calculation of c.
        double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset];
        // Compute some values based on u0 (inputs to the jacu_* macros).
        double tmp1 = 1.0e+00 / u0;
        double tmp2 = tmp1 * tmp1;
        double tmp3 = tmp1 * tmp2;
        // Accumulate omega * C * vn column by column. The vnX reloads below
        // duplicate the reads above; kept as-is (behaviour-neutral).
        vn0 = v[h_below + 0 * m_offset];
        double v0 = omega * ( jacu_c_value_00 * vn0 );
        double v1 = omega * ( jacu_c_value_01 * vn0 );
        double v2 = omega * ( jacu_c_value_02 * vn0 );
        double v3 = omega * ( jacu_c_value_03 * vn0 );
        double v4 = omega * ( jacu_c_value_04 * vn0 );
        vn1 = v[h_below + 1 * m_offset];
        v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 );
        vn2 = v[h_below + 2 * m_offset];
        v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 );
        vn3 = v[h_below + 3 * m_offset];
        v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 );
        vn4 = v[h_below + 4 * m_offset];
        v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 );
        // Update the values of v based on its neighbours in the j direction.
        int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
        // Read in v neighbour.
        vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset];
        // Read in u neighbour, for calculation of b.
        u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset];
        // Compute some values based on u0.
        tmp1 = 1.0e+00 / u0;
        tmp2 = tmp1 * tmp1;
        tmp3 = tmp1 * tmp2;
        // Accumulate omega * B * vn.
        vn0 = v[h_south + 0 * m_offset];
        v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 );
        vn1 = v[h_south + 1 * m_offset];
        v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 );
        vn2 = v[h_south + 2 * m_offset];
        v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 );
        vn3 = v[h_south + 3 * m_offset];
        v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 );
        vn4 = v[h_south + 4 * m_offset];
        v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 );
        // Update the values of v based on its neighbours in the i direction.
        // Calculate the index of (k, j, i+1).
        int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
        // Read in v neighbour.
        vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset];
        // Read in u neighbour, for calculation of a.
        u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset];
        // Compute some values based on u0.
        tmp1 = 1.0e+00 / u0;
        tmp2 = tmp1 * tmp1;
        tmp3 = tmp1 * tmp2;
        // Accumulate omega * A * vn.
        vn0 = v[h_east + 0 * m_offset];
        v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 );
        vn1 = v[h_east + 1 * m_offset];
        v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 );
        vn2 = v[h_east + 2 * m_offset];
        v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 );
        vn3 = v[h_east + 3 * m_offset];
        v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 );
        vn4 = v[h_east + 4 * m_offset];
        v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 );
        /**
         * Diagonal block inversion.
         */
        // Read in u values (at the cell itself, for the jacu_d block).
        u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset];
        // Compute some values based on u0.
        tmp1 = 1.0e+00 / u0;
        tmp2 = tmp1 * tmp1;
        tmp3 = tmp1 * tmp2;
        // Materialise the 5x5 diagonal jacobian block in scalar registers.
        double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40;
        double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41;
        double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42;
        double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43;
        double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44;
        // Gaussian elimination, no pivoting. ip = 0.
        double tmp;
        tmp1 = 1.0e+00 / tmat00;
        tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp;
        tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp;
        tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp;
        tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp;
        // ip = 1.
        tmp1 = 1.0e+00 / tmat11;
        tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp;
        tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp;
        tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp;
        // ip = 2
        tmp1 = 1.0e+00 / tmat22;
        tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp;
        tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp;
        // ip = 3
        tmp1 = 1.0e+00 / tmat33;
        tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp;
        /**
         * Back substitution.
         */
        v4 = v4 / tmat44;
        v3 = v3 - tmat43 * v4; v3 = v3 / tmat33;
        v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22;
        v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11;
        v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00;
        // Update the values of v.
        v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4;
    }
}

// OpenCL kernel for buts step.
// Vectorised driver: each work-item walks the cell list in strides of
// vlength cells. When every lane of a vector chunk passes the interior /
// depth test the whole chunk is processed with vdouble arithmetic
// (structurally identical to buts_serial above); if only some lanes pass,
// each lane falls back to buts_serial. A final scalar loop handles the
// remainder cells beyond the last full vector chunk.
// NOTE(review): vint / vdouble and the unsuffixed vload / vstore come from
// project-level build definitions (vector width vlength) — confirm there.
__kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) {
    // Get thread id.
    const int gid = get_global_id(0);
    const int threads = get_global_size(0);
    int cell;
    // Largest multiple of vlength not exceeding the cell count: the vector
    // loop stops here, the scalar tidy-up loop finishes the rest.
    const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength;
    for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) {
        const vint iv = vload(0, columns + cell);
        const vint jv = vload(0, rows + cell);
        const vint kv = (starting_k - (kblock - 1)) + (wavefront - (iv + jv));
        const vint depthv = (wavefront - (iv + jv));
        // Look at which elements of the vector need to be updated.
        vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1);
        // If they ALL need to be updated, do a "proper" vector op.
        if (all(b)) {
            // All lanes share the first cell's (i, j, k); the hyperplane
            // layout makes the vlength cells contiguous from h_index.
            const int i = columns[cell];
            const int j = rows[cell];
            const int k = (starting_k - (kblock -1)) + (wavefront - (i + j));
            const double r43 = ( 4.0e+00 / 3.0e+00 );
            const double c1345 = c1_def * c3_def * c4_def * c5_def;
            const double c34 = c3_def * c4_def;
            int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Update the values of v based on the cell's neighbour in the k direction.
            // Calculate the index for (k+1, j, i).
            int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Read in v neighbour.
            vdouble vn0 = vload(0, v + h_below + 0 * m_offset); vdouble vn1 = vload(0, v + h_below + 1 * m_offset); vdouble vn2 = vload(0, v + h_below + 2 * m_offset); vdouble vn3 = vload(0, v + h_below + 3 * m_offset); vdouble vn4 = vload(0, v + h_below + 4 * m_offset);
            // Read in u neighbour, for calculation of c.
            vdouble u0 = vload(0, u + h_below + 0 * m_offset); vdouble u1 = vload(0, u + h_below + 1 * m_offset); vdouble u2 = vload(0, u + h_below + 2 * m_offset); vdouble u3 = vload(0, u + h_below + 3 * m_offset); vdouble u4 = vload(0, u + h_below + 4 * m_offset);
            // Compute some values based on u0 (inputs to the jacu_* macros).
            vdouble tmp1 = 1.0e+00 / u0;
            vdouble tmp2 = tmp1 * tmp1;
            vdouble tmp3 = tmp1 * tmp2;
            // Accumulate omega * C * vn column by column.
            vn0 = vload(0, v + h_below + 0 * m_offset);
            vdouble v0 = omega * ( jacu_c_value_00 * vn0 );
            vdouble v1 = omega * ( jacu_c_value_01 * vn0 );
            vdouble v2 = omega * ( jacu_c_value_02 * vn0 );
            vdouble v3 = omega * ( jacu_c_value_03 * vn0 );
            vdouble v4 = omega * ( jacu_c_value_04 * vn0 );
            vn1 = vload(0, v + h_below + 1 * m_offset);
            v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 );
            vn2 = vload(0, v + h_below + 2 * m_offset);
            v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 );
            vn3 = vload(0, v + h_below + 3 * m_offset);
            v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 );
            vn4 = vload(0, v + h_below + 4 * m_offset);
            v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 );
            // Update the values of v based on its neighbours in the j direction.
            int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Read in v neighbour.
            vn0 = vload(0, v + h_south + 0 * m_offset); vn1 = vload(0, v + h_south + 1 * m_offset); vn2 = vload(0, v + h_south + 2 * m_offset); vn3 = vload(0, v + h_south + 3 * m_offset); vn4 = vload(0, v + h_south + 4 * m_offset);
            // Read in u neighbour, for calculation of b.
            u0 = vload(0, u + h_south + 0 * m_offset); u1 = vload(0, u + h_south + 1 * m_offset); u2 = vload(0, u + h_south + 2 * m_offset); u3 = vload(0, u + h_south + 3 * m_offset); u4 = vload(0, u + h_south + 4 * m_offset);
            // Compute some values based on u0.
            tmp1 = 1.0e+00 / u0;
            tmp2 = tmp1 * tmp1;
            tmp3 = tmp1 * tmp2;
            // Accumulate omega * B * vn.
            vn0 = vload(0, v + h_south + 0 * m_offset);
            v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 );
            vn1 = vload(0, v + h_south + 1 * m_offset);
            v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 );
            vn2 = vload(0, v + h_south + 2 * m_offset);
            v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 );
            vn3 = vload(0, v + h_south + 3 * m_offset);
            v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 );
            vn4 = vload(0, v + h_south + 4 * m_offset);
            v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 );
            // Update the values of v based on its neighbours in the i direction.
            // Calculate the index of (k, j, i+1).
            int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Read in v neighbour.
            vn0 = vload(0, v + h_east + 0 * m_offset); vn1 = vload(0, v + h_east + 1 * m_offset); vn2 = vload(0, v + h_east + 2 * m_offset); vn3 = vload(0, v + h_east + 3 * m_offset); vn4 = vload(0, v + h_east + 4 * m_offset);
            // Read in u neighbour, for calculation of a.
            u0 = vload(0, u + h_east + 0 * m_offset); u1 = vload(0, u + h_east + 1 * m_offset); u2 = vload(0, u + h_east + 2 * m_offset); u3 = vload(0, u + h_east + 3 * m_offset); u4 = vload(0, u + h_east + 4 * m_offset);
            // Compute some values based on u0.
            tmp1 = 1.0e+00 / u0;
            tmp2 = tmp1 * tmp1;
            tmp3 = tmp1 * tmp2;
            // Accumulate omega * A * vn.
            vn0 = vload(0, v + h_east + 0 * m_offset);
            v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 );
            vn1 = vload(0, v + h_east + 1 * m_offset);
            v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 );
            vn2 = vload(0, v + h_east + 2 * m_offset);
            v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 );
            vn3 = vload(0, v + h_east + 3 * m_offset);
            v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 );
            vn4 = vload(0, v + h_east + 4 * m_offset);
            v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 );
            /**
             * Diagonal block inversion.
             */
            // Read in u values.
            u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset);
            // Compute some values based on u0.
            tmp1 = 1.0e+00 / u0;
            tmp2 = tmp1 * tmp1;
            tmp3 = tmp1 * tmp2;
            // Materialise the 5x5 diagonal jacobian block, one vector per entry.
            vdouble tmat00 = jacu_d_value_00; vdouble tmat10 = jacu_d_value_10; vdouble tmat20 = jacu_d_value_20; vdouble tmat30 = jacu_d_value_30; vdouble tmat40 = jacu_d_value_40;
            vdouble tmat01 = jacu_d_value_01; vdouble tmat11 = jacu_d_value_11; vdouble tmat21 = jacu_d_value_21; vdouble tmat31 = jacu_d_value_31; vdouble tmat41 = jacu_d_value_41;
            vdouble tmat02 = jacu_d_value_02; vdouble tmat12 = jacu_d_value_12; vdouble tmat22 = jacu_d_value_22; vdouble tmat32 = jacu_d_value_32; vdouble tmat42 = jacu_d_value_42;
            vdouble tmat03 = jacu_d_value_03; vdouble tmat13 = jacu_d_value_13; vdouble tmat23 = jacu_d_value_23; vdouble tmat33 = jacu_d_value_33; vdouble tmat43 = jacu_d_value_43;
            vdouble tmat04 = jacu_d_value_04; vdouble tmat14 = jacu_d_value_14; vdouble tmat24 = jacu_d_value_24; vdouble tmat34 = jacu_d_value_34; vdouble tmat44 = jacu_d_value_44;
            // Gaussian elimination, no pivoting. ip = 0.
            vdouble tmp;
            tmp1 = 1.0e+00 / tmat00;
            tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp;
            tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp;
            tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp;
            tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp;
            // ip = 1.
            tmp1 = 1.0e+00 / tmat11;
            tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp;
            tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp;
            tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp;
            // ip = 2
            tmp1 = 1.0e+00 / tmat22;
            tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp;
            tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp;
            // ip = 3
            tmp1 = 1.0e+00 / tmat33;
            tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp;
            /**
             * Back substitution.
             */
            v4 = v4 / tmat44;
            v3 = v3 - tmat43 * v4; v3 = v3 / tmat33;
            v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22;
            v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11;
            v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00;
            // Update v (read-modify-write, one component vector at a time).
            vdouble old_v;
            old_v = vload(0, v + h_index + 0 * m_offset); old_v -= v0; vstore(old_v, 0, v + h_index + 0 * m_offset);
            old_v = vload(0, v + h_index + 1 * m_offset); old_v -= v1; vstore(old_v, 0, v + h_index + 1 * m_offset);
            old_v = vload(0, v + h_index + 2 * m_offset); old_v -= v2; vstore(old_v, 0, v + h_index + 2 * m_offset);
            old_v = vload(0, v + h_index + 3 * m_offset); old_v -= v3; vstore(old_v, 0, v + h_index + 3 * m_offset);
            old_v = vload(0, v + h_index + 4 * m_offset); old_v -= v4; vstore(old_v, 0, v + h_index + 4 * m_offset);
        } else if (any(b)) {
            // Mixed chunk: only some lanes are active — fall back to the
            // scalar path lane by lane (buts_serial re-checks the bounds).
            int vcell;
            for (vcell = 0; vcell < vlength; vcell++) {
                buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell);
            }
        }
    }
    // Serial tidy-up.
    // NOTE(review): this remainder loop continues from each thread's private
    // `cell` (a multiple of vlength offset) but strides by `threads`; whether
    // that partitions [cellbound, (isiz1+4)*(isiz2+4)) exactly once per cell
    // appears to depend on the launch configuration — confirm against host code.
    for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) {
        buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell);
    }
}
kernels/vector2/blts.cl0000644000175600017620000007767511553015516013574 0ustar sjpsjp// Macro.
// NOTE(review): the line above is a tar member boundary — start of
// kernels/vector2/blts.cl (lower-triangular sweep, jacld_* jacobians).
#define c1 c1_def
#define c2 c2_def
#define c3 c3_def
#define c4 c4_def
#define c5 c5_def
// Sparse matrix definitions.
#define jacld_a_value_00 (-dt * tz1 * dz1)
#define jacld_a_value_10 (0.0e+00)
#define jacld_a_value_20 (0.0e+00)
#define jacld_a_value_30 (-dt * tz2)
#define jacld_a_value_40 (0.0e+00)
#define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 ))
#define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2)
#define jacld_a_value_21 (0.0e+00)
#define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 ))
#define jacld_a_value_41 (0.0e+00)
#define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 ))
#define jacld_a_value_12 (0.0e+00)
#define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3)
#define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 ))
#define jacld_a_value_42 (0.0e+00)
#define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 ))
#define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) ))
#define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) ))
#define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4)
#define jacld_a_value_43 (-dt * tz2 * c2)
#define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 ))
#define
jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 
* u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( 
u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * 
tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. 
void blts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; //if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); /*} else { vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); //vlong b2 = (vlong) ((long) iv >= (long) ist && (long) iv <= (long) iend && (long) jv >= (long) jst && (long) jv <= (long) jend && (long) kv >= 1 && (long) kv <= nz - 2 && (long) depthv >= (long) 0 && (long) depthv <= (long) kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * 
m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); }*/ // If there are some elements that don't require an update, iterate through the vector. } else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } } } // Serial remainder. for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } kernels/vector2/.nfs00000000c06e7660000000480000644000175600017620000000231711544123620015475 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { // Calculate block offset. int block_depth = k / kblock; int depth = k - (kblock * block_depth); int block_offset = block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock * 5); // Calculate thread offset. int thread_offset = thread_mapping[(j * (isiz1 + 4)) + i]; thread_offset += wave_offset_3d[i + j + depth]; if ( (i + j + depth) >= kblock - 1 ) { thread_offset = thread_offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. return block_offset + (5 * thread_offset) + m; } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { return flat_index(k, j, i, m); } // Macro definitions for blts and buts. 
#define m_offset (1) kernels/vector2/.nfs00000000c02efb42000000440000644000175600017620000000415011544361530015621 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { int offset = 0; // Calculate thread id. offset += thread_mapping[(j * (isiz1 + 4)) + i]; // Jump enough blocks. int block_depth = k / kblock; int depth = k - (kblock * block_depth); offset += block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock); // Jump to the right wavefront. offset += wave_offset_3d[i + j + depth]; // Update thread_offset. if ( (i + j + depth) >= kblock - 1 ) { offset = offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4)); return offset; } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { return m * (isiz1+4) * (isiz2+4) * isiz3 + (k * (isiz2 + 4) + j) * (isiz1 + 4) + i; /*int offset = 0; // Add block offset. const int block_i = (i / rhsblock_x); const int block_j = (j / rhsblock_y); const int block_id = (block_j * rhsgrid_x) + block_i; offset += block_id * (rhsblock_x * rhsblock_y * isiz3); // Add thread offset. const int thread_i = i - (block_i * rhsblock_x); const int thread_j = j - (block_j * rhsblock_y); offset += (thread_j * rhsblock_x) + thread_i; // Add k offset. offset += k * (rhsblock_x * rhsblock_y); // Add angle offset. offset += m * (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3); // Return. return offset;*/ } // Macro definitions for blts and buts. 
#define m_offset (problem_height * (isiz2 + 4) * (isiz1 + 4)) // Macro definitions for vector kernels. #define vint int2 #define vdouble double2 #define vlong long2 #define vlength 2 #define vload vload2 #define vstore vstore2 kernels/vector/rearrangement.cl0000644000175600017620000002046511541645670015400 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. */ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = flat_input[flat_index(k, j, i, 0)]; tiled_output[tiled_index(k, j, i, 1)] = flat_input[flat_index(k, j, i, 1)]; tiled_output[tiled_index(k, j, i, 2)] = flat_input[flat_index(k, j, i, 2)]; tiled_output[tiled_index(k, j, i, 3)] = flat_input[flat_index(k, j, i, 3)]; tiled_output[tiled_index(k, j, i, 4)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from tiled to flat layout. */ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = tiled_input[tiled_index(k, j, i, 0)]; flat_output[flat_index(k, j, i, 1)] = tiled_input[tiled_index(k, j, i, 1)]; flat_output[flat_index(k, j, i, 2)] = tiled_input[tiled_index(k, j, i, 2)]; flat_output[flat_index(k, j, i, 3)] = tiled_input[tiled_index(k, j, i, 3)]; flat_output[flat_index(k, j, i, 4)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from tiled to hyperplane layout. 
*/ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } kernels/vector/print.cl0000644000175600017620000000061411535717535013677 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/vector/pre.cl0000644000175600017620000000135211541651721013320 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } kernels/vector/post.cl0000644000175600017620000000166111541651754013530 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } kernels/vector/l2norm.cl0000644000175600017620000000072311524742105013742 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. 
int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/vector/ex3_unpack.cl0000644000175600017620000001044411542631035014571 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/vector/ex3_pack.cl0000644000175600017620000001031111542631026014217 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/vector/ex1_unpack.cl0000644000175600017620000001043111542645666014602 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/vector/ex1_pack.cl0000644000175600017620000001040711542645757014243 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/vector/buts.cl0000644000175600017620000010135711553015505013512 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. void buts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. 
int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). 
int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. 
u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. 
const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = (starting_k - (kblock - 1)) + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vdouble vn0 = vload(0, v + h_below + 0 * m_offset); vdouble vn1 = vload(0, v + h_below + 1 * m_offset); vdouble vn2 = vload(0, v + h_below + 2 * m_offset); vdouble vn3 = vload(0, v + h_below + 3 * m_offset); vdouble vn4 = vload(0, v + h_below + 4 * m_offset); // Read in u neighbour, for calculation of c. vdouble u0 = vload(0, u + h_below + 0 * m_offset); vdouble u1 = vload(0, u + h_below + 1 * m_offset); vdouble u2 = vload(0, u + h_below + 2 * m_offset); vdouble u3 = vload(0, u + h_below + 3 * m_offset); vdouble u4 = vload(0, u + h_below + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_below + 0 * m_offset); vdouble v0 = omega * ( jacu_c_value_00 * vn0 ); vdouble v1 = omega * ( jacu_c_value_01 * vn0 ); vdouble v2 = omega * ( jacu_c_value_02 * vn0 ); vdouble v3 = omega * ( jacu_c_value_03 * vn0 ); vdouble v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = vload(0, v + h_below + 1 * m_offset); v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = vload(0, v + h_below + 2 * m_offset); v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = vload(0, v + h_below + 3 * m_offset); v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = vload(0, v + h_below + 4 * m_offset); v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = vload(0, v + h_south + 0 * m_offset); vn1 = vload(0, v + h_south + 1 * m_offset); vn2 = vload(0, v + h_south + 2 * m_offset); vn3 = vload(0, v + h_south + 3 * m_offset); vn4 = vload(0, v + h_south + 4 * m_offset); // Read in u neighbour, for calculation of b. 
u0 = vload(0, u + h_south + 0 * m_offset); u1 = vload(0, u + h_south + 1 * m_offset); u2 = vload(0, u + h_south + 2 * m_offset); u3 = vload(0, u + h_south + 3 * m_offset); u4 = vload(0, u + h_south + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_south + 0 * m_offset); v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = vload(0, v + h_south + 1 * m_offset); v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = vload(0, v + h_south + 2 * m_offset); v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = vload(0, v + h_south + 3 * m_offset); v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = vload(0, v + h_south + 4 * m_offset); v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
vn0 = vload(0, v + h_east + 0 * m_offset); vn1 = vload(0, v + h_east + 1 * m_offset); vn2 = vload(0, v + h_east + 2 * m_offset); vn3 = vload(0, v + h_east + 3 * m_offset); vn4 = vload(0, v + h_east + 4 * m_offset); // Read in u neighbour, for calculation of a. u0 = vload(0, u + h_east + 0 * m_offset); u1 = vload(0, u + h_east + 1 * m_offset); u2 = vload(0, u + h_east + 2 * m_offset); u3 = vload(0, u + h_east + 3 * m_offset); u4 = vload(0, u + h_east + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_east + 0 * m_offset); v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = vload(0, v + h_east + 1 * m_offset); v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = vload(0, v + h_east + 2 * m_offset); v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = vload(0, v + h_east + 3 * m_offset); v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = vload(0, v + h_east + 4 * m_offset); v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. 
u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacu_d_value_00; vdouble tmat10 = jacu_d_value_10; vdouble tmat20 = jacu_d_value_20; vdouble tmat30 = jacu_d_value_30; vdouble tmat40 = jacu_d_value_40; vdouble tmat01 = jacu_d_value_01; vdouble tmat11 = jacu_d_value_11; vdouble tmat21 = jacu_d_value_21; vdouble tmat31 = jacu_d_value_31; vdouble tmat41 = jacu_d_value_41; vdouble tmat02 = jacu_d_value_02; vdouble tmat12 = jacu_d_value_12; vdouble tmat22 = jacu_d_value_22; vdouble tmat32 = jacu_d_value_32; vdouble tmat42 = jacu_d_value_42; vdouble tmat03 = jacu_d_value_03; vdouble tmat13 = jacu_d_value_13; vdouble tmat23 = jacu_d_value_23; vdouble tmat33 = jacu_d_value_33; vdouble tmat43 = jacu_d_value_43; vdouble tmat04 = jacu_d_value_04; vdouble tmat14 = jacu_d_value_14; vdouble tmat24 = jacu_d_value_24; vdouble tmat34 = jacu_d_value_34; vdouble tmat44 = jacu_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update v. vdouble old_v; old_v = vload(0, v + h_index + 0 * m_offset); old_v -= v0; vstore(old_v, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); old_v -= v1; vstore(old_v, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); old_v -= v2; vstore(old_v, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); old_v -= v3; vstore(old_v, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); old_v -= v4; vstore(old_v, 0, v + h_index + 4 * m_offset); } else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } } } // Serial tidy-up. 
for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } kernels/vector/blts.cl.uneven0000644000175600017620000007767711553276230015025 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. #define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define 
jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 
* u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( 
u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * 
tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. 
void blts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); } else { vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); //vlong b2 = (vlong) ((long) iv >= (long) ist && (long) iv <= (long) iend && (long) jv >= (long) jst && (long) jv <= (long) jend && (long) kv >= 1 && (long) kv <= nz - 2 && (long) depthv >= (long) 0 && (long) depthv <= (long) kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * 
m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); } // If there are some elements that don't require an update, iterate through the vector. /*} else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } }*/ } // Serial remainder. /*for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); }*/ } kernels/vector/blts.cl.even0000644000175600017620000005216311553277345014451 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. #define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) 
#define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 
)) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * 
( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * 
c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. 
__kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. 
vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. 
u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. 
u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); } else { vlong b2 = convert_long2(iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); } } } } kernels/vector/blts.cl0000644000175600017620000005225711553300350013500 0ustar sjpsjp// Macro. 
#define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. #define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( 
u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 
- c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) 
#define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 
* ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; //const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; const int cellbound = (isiz1 + 4) * (isiz2 + 4); for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (any(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; /*if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); //} else {*/ vlong b2 = convert_long2(iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); //} } } } kernels/vector/.nfs00000000c06e7660000000480000644000175600017620000000231711544123620015413 
0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { // Calculate block offset. int block_depth = k / kblock; int depth = k - (kblock * block_depth); int block_offset = block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock * 5); // Calculate thread offset. int thread_offset = thread_mapping[(j * (isiz1 + 4)) + i]; thread_offset += wave_offset_3d[i + j + depth]; if ( (i + j + depth) >= kblock - 1 ) { thread_offset = thread_offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. return block_offset + (5 * thread_offset) + m; } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { return flat_index(k, j, i, m); } // Macro definitions for blts and buts. #define m_offset (1) kernels/vector/.nfs00000000c02efb42000000440000644000175600017620000000415011544361530015537 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { int offset = 0; // Calculate thread id. offset += thread_mapping[(j * (isiz1 + 4)) + i]; // Jump enough blocks. 
int block_depth = k / kblock; int depth = k - (kblock * block_depth); offset += block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock); // Jump to the right wavefront. offset += wave_offset_3d[i + j + depth]; // Update thread_offset. if ( (i + j + depth) >= kblock - 1 ) { offset = offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4)); return offset; } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { return m * (isiz1+4) * (isiz2+4) * isiz3 + (k * (isiz2 + 4) + j) * (isiz1 + 4) + i; /*int offset = 0; // Add block offset. const int block_i = (i / rhsblock_x); const int block_j = (j / rhsblock_y); const int block_id = (block_j * rhsgrid_x) + block_i; offset += block_id * (rhsblock_x * rhsblock_y * isiz3); // Add thread offset. const int thread_i = i - (block_i * rhsblock_x); const int thread_j = j - (block_j * rhsblock_y); offset += (thread_j * rhsblock_x) + thread_i; // Add k offset. offset += k * (rhsblock_x * rhsblock_y); // Add angle offset. offset += m * (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3); // Return. return offset;*/ } // Macro definitions for blts and buts. #define m_offset (problem_height * (isiz2 + 4) * (isiz1 + 4)) // Macro definitions for vector kernels. #define vint int2 #define vdouble double2 #define vlong long2 #define vlength 2 #define vload vload2 #define vstore vstore2 kernels/scalar/rearrangement.cl0000644000175600017620000002046511541645670015343 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. */ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = flat_input[flat_index(k, j, i, 0)]; tiled_output[tiled_index(k, j, i, 1)] = flat_input[flat_index(k, j, i, 1)]; tiled_output[tiled_index(k, j, i, 2)] = flat_input[flat_index(k, j, i, 2)]; tiled_output[tiled_index(k, j, i, 3)] = flat_input[flat_index(k, j, i, 3)]; tiled_output[tiled_index(k, j, i, 4)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from tiled to flat layout. */ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = tiled_input[tiled_index(k, j, i, 0)]; flat_output[flat_index(k, j, i, 1)] = tiled_input[tiled_index(k, j, i, 1)]; flat_output[flat_index(k, j, i, 2)] = tiled_input[tiled_index(k, j, i, 2)]; flat_output[flat_index(k, j, i, 3)] = tiled_input[tiled_index(k, j, i, 3)]; flat_output[flat_index(k, j, i, 4)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from tiled to hyperplane layout. 
*/ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } kernels/scalar/print.cl0000644000175600017620000000061411535717535013642 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/scalar/pre.cl0000644000175600017620000000135211564201203013252 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } kernels/scalar/post.cl0000644000175600017620000000166111564201200013451 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } kernels/scalar/l2norm.cl0000644000175600017620000000072311524742105013705 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. 
int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/scalar/ex3_unpack.cl0000644000175600017620000001044411542631035014534 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/scalar/ex3_pack.cl0000644000175600017620000001031111542631026014162 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/scalar/ex1_unpack.cl0000644000175600017620000001043111542645666014545 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/scalar/ex1_pack.cl0000644000175600017620000001040711542645757014206 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/scalar/buts.cl0000644000175600017620000005063711570737742013476 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } } kernels/scalar/blts.cl0000644000175600017620000004725511570737742013467 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } kernels/scalar/.nfs00000000809e7a67000000430000644000175600017620000005063711545055425015403 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
// Sweep the padded 2-D (i, j) footprint; consecutive work-items take
// consecutive cells so global-memory accesses coalesce.
int cell;
for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) {
// Recover the (i, j) coordinates for this cell from the precomputed maps.
const int i = columns[cell];
const int j = rows[cell];
#ifdef APPLU_BLOCKING_OLD
// Blocked sweep: k is offset back by the block height; depth tracks the
// position of this cell inside the current k-block.
const int k = (starting_k - (kblock -1)) + (wavefront - (i + j));
const int depth = (wavefront - (i + j));
#else //#ifdef APPLU_BLOCKING_NEW
// Hyperplane condition: cells with i + j + k == wavefront are independent.
const int k = wavefront - (i + j);
#endif
// Constants used by the jacu_* Jacobian macros.
const double r43 = ( 4.0e+00 / 3.0e+00 );
const double c1345 = c1_def * c3_def * c4_def * c5_def;
const double c34 = c3_def * c4_def;
#ifdef APPLU_BLOCKING_OLD
// Only process interior cells that fall inside the current k-block.
if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) {
#else //#ifdef APPLU_BLOCKING_NEW
// Only process interior cells of the domain.
if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) {
#endif
// Linearised index of this cell in the hyperplane-ordered storage.
int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Update the values of v based on the cell's neighbour in the k direction.
// Calculate the index for (k+1, j, i) — the upper sweep reads the k+1 neighbour.
int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Read in v neighbour.
double vn0 = v[h_below + 0 * m_offset];
double vn1 = v[h_below + 1 * m_offset];
double vn2 = v[h_below + 2 * m_offset];
double vn3 = v[h_below + 3 * m_offset];
double vn4 = v[h_below + 4 * m_offset];
// Read in u neighbour, for calculation of c (the jacu_c_value_* macros
// expand in terms of u1..u4 and tmp1/tmp2/tmp3).
double u0 = u[h_below + 0 * m_offset];
double u1 = u[h_below + 1 * m_offset];
double u2 = u[h_below + 2 * m_offset];
double u3 = u[h_below + 3 * m_offset];
double u4 = u[h_below + 4 * m_offset];
// Compute some values based on u0.
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } } kernels/scalar/.nfs0000000080933c0f000000280000644000175600017620000004736411545055127015375 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); if (gid == 0) { printf("BLTS: Wavefront %d.\n", wavefront); } // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } kernels/scalar/.nfs000000008092c61c000000290000644000175600017620000005074411545055235015372 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); if (gid == 0) { printf("BUTS: Wavefront %d.\n", wavefront); } // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } } kernels/scalar/.nfs00000000802f321f000000320000644000175600017620000004737411545056410015361 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif printf("Wavefront %d: (%d, %d, %d), depth = %d\n", wavefront, i, j, k, k); const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } kernels/vector2/bak/blts.cl.vector_old0000644000175600017620000011300411544351602016450 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4))/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Only proceed to calculation if at least one element of the vector needs to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); if (any(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; // Very difficult to vectorise this, since we're going to have some branching... // Current implementation: Do all of the maths, but predicate the store. const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, v0.s0, v1.s0, v2.s0, v3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, v0.s1, v1.s1, v2.s1, v3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, v0.s2, v1.s2, v2.s2, v3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, v0.s3, v1.s3, v2.s3, v3.s3); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. 
vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); 
vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, vn0.s0, vn1.s0, vn2.s0, vn3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, vn0.s1, vn1.s1, vn2.s1, vn3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, vn0.s2, vn1.s2, vn2.s2, vn3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, vn0.s3, vn1.s3, vn2.s3, vn3.s3); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, vn0.s0, vn1.s0, vn2.s0, vn3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, vn0.s1, vn1.s1, vn2.s1, vn3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, vn0.s2, vn1.s2, vn2.s2, vn3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, vn0.s3, vn1.s3, vn2.s3, vn3.s3); // Update the values of v based on the cell's neighbours in the i dimension. 
// Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); printf("h_west = %d, u + h_west = %p\n", h_west, u + h_west); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, vn0.s0, vn1.s0, vn2.s0, vn3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, vn0.s1, vn1.s1, vn2.s1, vn3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, vn0.s2, vn1.s2, vn2.s2, vn3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, vn0.s3, vn1.s3, vn2.s3, vn3.s3); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. 
u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. 
// Forward elimination: in-place Gaussian reduction of the 5x5 diagonal
// block held in tmat00..tmat44, applied simultaneously to the RHS vector
// lanes (v0..v4).  Naming appears to be tmatXY = element at column X,
// row Y (matches NPB blts: tmp = tmat(2,1)/tmat(1,1) <-> tmp1*tmat01) --
// TODO confirm against the host-side reference.  Each "ip" step divides
// by the pivot tmat(ip,ip); no pivot search is performed, so the d-block
// is assumed non-singular.
vdouble tmp;
// ip = 0.
tmp1 = 1.0e+00 / tmat00;
tmp = tmp1 * tmat01;   // elimination factor for row 1
tmat11 -= tmp * tmat10;
tmat21 -= tmp * tmat20;
tmat31 -= tmp * tmat30;
tmat41 -= tmp * tmat40;
v1 -= v0 * tmp;
tmp = tmp1 * tmat02;   // elimination factor for row 2
tmat12 -= tmp * tmat10;
tmat22 -= tmp * tmat20;
tmat32 -= tmp * tmat30;
tmat42 -= tmp * tmat40;
v2 -= v0 * tmp;
tmp = tmp1 * tmat03;   // elimination factor for row 3
tmat13 -= tmp * tmat10;
tmat23 -= tmp * tmat20;
tmat33 -= tmp * tmat30;
tmat43 -= tmp * tmat40;
v3 -= v0 * tmp;
tmp = tmp1 * tmat04;   // elimination factor for row 4
tmat14 -= tmp * tmat10;
tmat24 -= tmp * tmat20;
tmat34 -= tmp * tmat30;
tmat44 -= tmp * tmat40;
v4 -= v0 * tmp;
// ip = 1.
tmp1 = 1.0e+00 / tmat11;
tmp = tmp1 * tmat12;
tmat22 -= tmp * tmat21;
tmat32 -= tmp * tmat31;
tmat42 -= tmp * tmat41;
v2 -= v1 * tmp;
tmp = tmp1 * tmat13;
tmat23 -= tmp * tmat21;
tmat33 -= tmp * tmat31;
tmat43 -= tmp * tmat41;
v3 -= v1 * tmp;
tmp = tmp1 * tmat14;
tmat24 -= tmp * tmat21;
tmat34 -= tmp * tmat31;
tmat44 -= tmp * tmat41;
v4 -= v1 * tmp;
// ip = 2
tmp1 = 1.0e+00 / tmat22;
tmp = tmp1 * tmat23;
tmat33 -= tmp * tmat32;
tmat43 -= tmp * tmat42;
v3 -= v2 * tmp;
tmp = tmp1 * tmat24;
tmat34 -= tmp * tmat32;
tmat44 -= tmp * tmat42;
v4 -= v2 * tmp;
// ip = 3
tmp1 = 1.0e+00 / tmat33;
tmp = tmp1 * tmat34;
tmat44 -= tmp * tmat43;
v4 -= v3 * tmp;
/**
 * Back substitution.
 */
// Back substitution on the upper-triangular system produced by the
// forward elimination above, then a store of the solved RHS (v0..v4)
// back to global memory, predicated on the per-lane bounds mask b.
v4 /= tmat44;
v3 -= tmat43 * v4;
v3 /= tmat33;
// NOTE(review): `v2 -= tmat32 * v3 - tmat42 * v4;` parses as
// v2 = v2 - (tmat32*v3 - tmat42*v4), i.e. the tmat42*v4 term is ADDED.
// The NPB blts reference subtracts every term
// (tv(3) = tv(3) - tmat(3,4)*tv(4) - tmat(3,5)*tv(5)); the same concern
// applies to the v1 and v0 lines below -- verify against the reference.
v2 -= tmat32 * v3 - tmat42 * v4;
v2 /= tmat22;
v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4;
v1 /= tmat11;
v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4;
v0 /= tmat00;
// Fast path: every vector lane is inside the wavefront bounds, so the
// whole vector can be stored unconditionally.
if (all(b)) {
vstore(v0, 0, v + h_index + 0 * m_offset);
vstore(v1, 0, v + h_index + 1 * m_offset);
vstore(v2, 0, v + h_index + 2 * m_offset);
vstore(v3, 0, v + h_index + 3 * m_offset);
vstore(v4, 0, v + h_index + 4 * m_offset);
// NOTE(review): debug printf -- the format string has FIVE %e
// conversions but only FOUR value arguments (v0..v3; v4.sN is
// presumably missing).  Too few arguments for the format is undefined
// behaviour per the printf contract; same issue in every debug printf
// in this file.
printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, v0.s0, v1.s0, v2.s0, v3.s0);
printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, v0.s1, v1.s1, v2.s1, v3.s1);
//printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, v0.s2, v1.s2, v2.s2, v3.s2);
//printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, v0.s3, v1.s3, v2.s3, v3.s3);
} else {
// Slow path: only some lanes are in bounds.  Recompute the bounds mask
// at vlong width (select() on doubles needs a 64-bit-per-lane mask),
// then merge each solved value with the previously stored value lane
// by lane before writing back.
vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1);
vdouble old_v = vload(0, v + h_index + 0 * m_offset);
v0 = select(old_v, v0, b2);
vstore(v0, 0, v + h_index + 0 * m_offset);
old_v = vload(0, v + h_index + 1 * m_offset);
v1 = select(old_v, v1, b2);
vstore(v1, 0, v + h_index + 1 * m_offset);
old_v = vload(0, v + h_index + 2 * m_offset);
v2 = select(old_v, v2, b2);
vstore(v2, 0, v + h_index + 2 * m_offset);
old_v = vload(0, v + h_index + 3 * m_offset);
v3 = select(old_v, v3, b2);
vstore(v3, 0, v + h_index + 3 * m_offset);
old_v = vload(0, v + h_index + 4 * m_offset);
v4 = select(old_v, v4, b2);
vstore(v4, 0, v + h_index + 4 * m_offset);
// NOTE(review): same five-%e / four-argument mismatch as above.
printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, v0.s0, v1.s0, v2.s0, v3.s0);
printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront,
iv.s1, jv.s1, kv.s1, b.s1, v0.s1, v1.s1, v2.s1, v3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, v0.s2, v1.s2, v2.s2, v3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, v0.s3, v1.s3, v2.s3, v3.s3); } } } // Serial remainder. if (gid == 0) { for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } } kernels/vector2/bak/blts.cl.vector0000644000175600017620000010021211553030665015612 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4))/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Only proceed to calculation if at least one element of the vector needs to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); if (any(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; // Very difficult to vectorise this, since we're going to have some branching... // Current implementation: Do all of the maths, but predicate the store. const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); } else { vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); /*if (any(b.s0)) { v[h_index + 0 * m_offset] = v0.s0; v[h_index + 1 * m_offset] = v1.s0; v[h_index + 2 * m_offset] = 
v2.s0; v[h_index + 3 * m_offset] = v3.s0; v[h_index + 4 * m_offset] = v4.s0; } if (any(b.s1)) { v[h_index + 1 + 0 * m_offset] = v0.s1; v[h_index + 1 + 1 * m_offset] = v1.s1; v[h_index + 1 + 2 * m_offset] = v2.s1; v[h_index + 1 + 3 * m_offset] = v3.s1; v[h_index + 1 + 4 * m_offset] = v4.s1; }*/ } } } // Serial remainder. if (gid == 0) { for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } } kernels/vector2/bak/blts.cl.scalar0000644000175600017620000004665111544123620015567 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. 
double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= 
omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). 
const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } kernels/vector2/.svn/entries0000444000175600017620000000456411551607761014570 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 nvidia.clh file 2011-03-28T15:21:53.125355Z ae20b732d4bd56a5f719c4559d7a557f 2011-03-23T14:53:37.138628Z 1172 sjp 224 post.cl file 1179 2011-03-29T14:25:34.000000Z 0ebcf921e1d1e641c2e0b2e78a2f88f1 2011-03-31T11:08:35.335679Z 1179 sjp 1987 pre.cl file 1179 2011-03-29T14:25:18.000000Z 9805475bce7430d7b5647bd75618f755 2011-03-31T11:08:35.335679Z 1179 sjp 1692 aos.clh file 1179 2011-03-31T11:05:15.244960Z 2849f6e9f5f59150c3810c2d3602b7a8 2011-03-31T11:08:35.335679Z 1179 sjp 1605 rearrangement.cl file 1179 2011-03-29T14:25:03.000000Z 276f84f8cdcaf261d992f7bb22c8e82f 2011-03-31T11:08:35.335679Z 1179 sjp 8079 soa.clh file 1179 2011-03-31T11:05:19.453877Z 
067aa25dba821ddaaa50f4c91c00a810 2011-03-31T11:08:35.335679Z 1179 sjp 2617 ex1_pack.cl file 1179 2011-03-28T15:21:52.000000Z 5c470f1c79bc27ed17f20486c5e4f182 2011-03-31T11:08:35.335679Z 1179 sjp 4359 print.cl file 2011-03-28T15:21:53.000000Z b76116d749d465c134ebe9223072010a 2011-03-23T14:53:37.138628Z 1172 sjp 396 ex3_pack.cl file 1179 2011-03-28T15:21:53.000000Z d9e92cd1106a7485e047a2bc45e0abb2 2011-03-31T11:08:35.335679Z 1179 sjp 4297 ex1_unpack.cl file 1179 2011-03-28T15:21:52.000000Z be26ec8a2d3fd30366ac331bcd38ce5f 2011-03-31T11:08:35.335679Z 1179 sjp 4377 rhs dir ex3_unpack.cl file 1179 2011-03-28T15:21:53.000000Z 74d5059ec9fbb485b7a354c93981d489 2011-03-31T11:08:35.335679Z 1179 sjp 4388 blts.cl file 1179 2011-03-29T14:09:05.000000Z 127d3d99224aa1a03911093c9dd80f2f 2011-03-31T11:08:35.335679Z 1179 sjp 32724 amd.clh file 2011-03-28T15:21:52.600365Z 90499e16fe79b2374ae3581d52c6da05 2011-03-23T14:53:37.138628Z 1172 sjp 297 l2norm.cl file 2011-03-28T15:21:53.000000Z eba7c66f757b1a14473b5b1f06929cdb 2011-03-23T14:53:37.138628Z 1172 sjp 467 buts.cl file 1179 2011-03-29T14:16:19.000000Z ecee90aa693a07c32522205e5a9d4184 2011-03-31T11:08:35.335679Z 1179 sjp 33542 kernels/vector2/rhs/rhs_setup.cl0000644000175600017620000000353211553015413015421 0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ibound = ((nx + 1)/vlength)*vlength; const int iinc = (isize * vlength); for (i = 2 + (iid*vlength); i <= ibound; i+= iinc) { vdouble rsd_v, frct_v; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; frct_v = vload(0, frct + t_index + 0 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 0 * t_offset); frct_v = vload(0, frct + t_index + 1 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 1 * t_offset); frct_v = vload(0, frct + t_index + 2 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 2 * t_offset); frct_v = vload(0, frct + t_index + 3 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 3 * t_offset); frct_v = vload(0, frct + t_index + 4 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 4 * t_offset); } for (; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/vector/bak/rearrangement.cl0000644000175600017620000001761711544365677016153 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. */ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int f_index = flat_index(k, j, i, 0); hyperplane_output[h_index + 0 * m_offset] = flat_input[f_index + 0]; hyperplane_output[h_index + 1 * m_offset] = flat_input[f_index + 1]; hyperplane_output[h_index + 2 * m_offset] = flat_input[f_index + 2]; hyperplane_output[h_index + 3 * m_offset] = flat_input[f_index + 3]; hyperplane_output[h_index + 4 * m_offset] = flat_input[f_index + 4]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int f_index = flat_index(k, j, i, 0); flat_output[f_index + 0] = hyperplane_input[h_index + 0 * m_offset]; flat_output[f_index + 1] = hyperplane_input[h_index + 1 * m_offset]; flat_output[f_index + 2] = hyperplane_input[h_index + 2 * m_offset]; flat_output[f_index + 3] = hyperplane_input[h_index + 3 * m_offset]; flat_output[f_index + 4] = hyperplane_input[h_index + 4 * m_offset]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int f_index = flat_index(k, j, i, 0); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); tiled_output[t_index + 0 * t_offset] = flat_input[f_index + 0]; tiled_output[t_index + 1 * t_offset] = flat_input[f_index + 1]; tiled_output[t_index + 2 * t_offset] = flat_input[f_index + 2]; tiled_output[t_index + 3 * t_offset] = flat_input[f_index + 3]; tiled_output[t_index + 4 * t_offset] = flat_input[f_index + 4]; } } } } /** * Shift from tiled to flat layout. 
*/ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int f_index = flat_index(k, j, i, 0); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flat_output[f_index + 0] = tiled_input[t_index + 0 * t_offset]; flat_output[f_index + 1] = tiled_input[t_index + 1 * t_offset]; flat_output[f_index + 2] = tiled_input[t_index + 2 * t_offset]; flat_output[f_index + 3] = tiled_input[t_index + 3 * t_offset]; flat_output[f_index + 4] = tiled_input[t_index + 4 * t_offset]; } } } } /** * Shift from tiled to hyperplane layout. */ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); hyperplane_output[h_index + 0 * m_offset] = tiled_input[t_index + 0 * t_offset]; hyperplane_output[h_index + 1 * m_offset] = tiled_input[t_index + 1 * t_offset]; hyperplane_output[h_index + 2 * m_offset] = tiled_input[t_index + 2 * t_offset]; hyperplane_output[h_index + 3 * m_offset] = tiled_input[t_index + 3 * t_offset]; hyperplane_output[h_index + 4 * m_offset] = tiled_input[t_index + 4 * t_offset]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); tiled_output[t_index + 0 * t_offset] = hyperplane_input[h_index + 0 * m_offset]; tiled_output[t_index + 1 * t_offset] = hyperplane_input[h_index + 1 * m_offset]; tiled_output[t_index + 2 * t_offset] = hyperplane_input[h_index + 2 * m_offset]; tiled_output[t_index + 3 * t_offset] = hyperplane_input[h_index + 3 * m_offset]; tiled_output[t_index + 4 * t_offset] = hyperplane_input[h_index + 4 * m_offset]; } } } } kernels/vector/bak/print.cl0000644000175600017620000000061411544123621014417 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/vector/bak/pre.cl0000644000175600017620000000317311553015436014060 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble dt_v = (vdouble) dt; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; vdouble res = vload(0, rsd + t_index + 0 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 0 * t_offset); res = vload(0, rsd + t_index + 1 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 1 * t_offset); res = vload(0, rsd + t_index + 2 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 2 * t_offset); res = vload(0, rsd + t_index + 3 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 3 * t_offset); res = vload(0, rsd + t_index + 4 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 4 * t_offset); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } kernels/vector/bak/post.cl0000644000175600017620000000364011553015427014256 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble tmp_v = (vdouble) tmp; int index; index = tiled_index(k, j, i, 0); vdouble u_v = vload(0, u + index); vdouble r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 1); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 2); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 3); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 4); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); } for (; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } kernels/vector/bak/l2norm.cl0000644000175600017620000000072311544123621014475 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. 
int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/vector/bak/ex3_unpack.cl0000644000175600017620000001044411544123621015325 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/vector/bak/ex3_pack.cl0000644000175600017620000001031111544123621014753 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/vector/bak/ex1_unpack.cl0000644000175600017620000001043111544123620015316 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/vector/bak/ex1_pack.cl0000644000175600017620000001040711544123620014756 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/vector/bak/buts.cl0000644000175600017620000010135711553015505014247 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. void buts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. 
int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). 
int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. 
u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. 
const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = (starting_k - (kblock - 1)) + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vdouble vn0 = vload(0, v + h_below + 0 * m_offset); vdouble vn1 = vload(0, v + h_below + 1 * m_offset); vdouble vn2 = vload(0, v + h_below + 2 * m_offset); vdouble vn3 = vload(0, v + h_below + 3 * m_offset); vdouble vn4 = vload(0, v + h_below + 4 * m_offset); // Read in u neighbour, for calculation of c. vdouble u0 = vload(0, u + h_below + 0 * m_offset); vdouble u1 = vload(0, u + h_below + 1 * m_offset); vdouble u2 = vload(0, u + h_below + 2 * m_offset); vdouble u3 = vload(0, u + h_below + 3 * m_offset); vdouble u4 = vload(0, u + h_below + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_below + 0 * m_offset); vdouble v0 = omega * ( jacu_c_value_00 * vn0 ); vdouble v1 = omega * ( jacu_c_value_01 * vn0 ); vdouble v2 = omega * ( jacu_c_value_02 * vn0 ); vdouble v3 = omega * ( jacu_c_value_03 * vn0 ); vdouble v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = vload(0, v + h_below + 1 * m_offset); v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = vload(0, v + h_below + 2 * m_offset); v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = vload(0, v + h_below + 3 * m_offset); v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = vload(0, v + h_below + 4 * m_offset); v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = vload(0, v + h_south + 0 * m_offset); vn1 = vload(0, v + h_south + 1 * m_offset); vn2 = vload(0, v + h_south + 2 * m_offset); vn3 = vload(0, v + h_south + 3 * m_offset); vn4 = vload(0, v + h_south + 4 * m_offset); // Read in u neighbour, for calculation of b. 
u0 = vload(0, u + h_south + 0 * m_offset); u1 = vload(0, u + h_south + 1 * m_offset); u2 = vload(0, u + h_south + 2 * m_offset); u3 = vload(0, u + h_south + 3 * m_offset); u4 = vload(0, u + h_south + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_south + 0 * m_offset); v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = vload(0, v + h_south + 1 * m_offset); v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = vload(0, v + h_south + 2 * m_offset); v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = vload(0, v + h_south + 3 * m_offset); v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = vload(0, v + h_south + 4 * m_offset); v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
vn0 = vload(0, v + h_east + 0 * m_offset); vn1 = vload(0, v + h_east + 1 * m_offset); vn2 = vload(0, v + h_east + 2 * m_offset); vn3 = vload(0, v + h_east + 3 * m_offset); vn4 = vload(0, v + h_east + 4 * m_offset); // Read in u neighbour, for calculation of a. u0 = vload(0, u + h_east + 0 * m_offset); u1 = vload(0, u + h_east + 1 * m_offset); u2 = vload(0, u + h_east + 2 * m_offset); u3 = vload(0, u + h_east + 3 * m_offset); u4 = vload(0, u + h_east + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_east + 0 * m_offset); v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = vload(0, v + h_east + 1 * m_offset); v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = vload(0, v + h_east + 2 * m_offset); v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = vload(0, v + h_east + 3 * m_offset); v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = vload(0, v + h_east + 4 * m_offset); v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. 
u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacu_d_value_00; vdouble tmat10 = jacu_d_value_10; vdouble tmat20 = jacu_d_value_20; vdouble tmat30 = jacu_d_value_30; vdouble tmat40 = jacu_d_value_40; vdouble tmat01 = jacu_d_value_01; vdouble tmat11 = jacu_d_value_11; vdouble tmat21 = jacu_d_value_21; vdouble tmat31 = jacu_d_value_31; vdouble tmat41 = jacu_d_value_41; vdouble tmat02 = jacu_d_value_02; vdouble tmat12 = jacu_d_value_12; vdouble tmat22 = jacu_d_value_22; vdouble tmat32 = jacu_d_value_32; vdouble tmat42 = jacu_d_value_42; vdouble tmat03 = jacu_d_value_03; vdouble tmat13 = jacu_d_value_13; vdouble tmat23 = jacu_d_value_23; vdouble tmat33 = jacu_d_value_33; vdouble tmat43 = jacu_d_value_43; vdouble tmat04 = jacu_d_value_04; vdouble tmat14 = jacu_d_value_14; vdouble tmat24 = jacu_d_value_24; vdouble tmat34 = jacu_d_value_34; vdouble tmat44 = jacu_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update v. vdouble old_v; old_v = vload(0, v + h_index + 0 * m_offset); old_v -= v0; vstore(old_v, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); old_v -= v1; vstore(old_v, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); old_v -= v2; vstore(old_v, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); old_v -= v3; vstore(old_v, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); old_v -= v4; vstore(old_v, 0, v + h_index + 4 * m_offset); } else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } } } // Serial tidy-up. 
for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } kernels/vector/bak/blts.cl.vector_old0000644000175600017620000011300411544351602016366 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. #define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) 
#define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * 
tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * 
tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 
* c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. 
const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4))/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Only proceed to calculation if at least one element of the vector needs to be updated. vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); if (any(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; // Very difficult to vectorise this, since we're going to have some branching... // Current implementation: Do all of the maths, but predicate the store. const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. 
vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, v0.s0, v1.s0, v2.s0, v3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, v0.s1, v1.s1, v2.s1, v3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, v0.s2, v1.s2, v2.s2, v3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- v(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, v0.s3, v1.s3, v2.s3, v3.s3); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- ua(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, vn0.s0, vn1.s0, vn2.s0, vn3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, vn0.s1, vn1.s1, vn2.s1, vn3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, vn0.s2, vn1.s2, vn2.s2, vn3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- va(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, vn0.s3, vn1.s3, vn2.s3, vn3.s3); // Update the 
values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- un(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, vn0.s0, vn1.s0, vn2.s0, vn3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, vn0.s1, vn1.s1, vn2.s1, vn3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, vn0.s2, vn1.s2, vn2.s2, vn3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vn(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, vn0.s3, vn1.s3, vn2.s3, vn3.s3); // Update the values of v based on the cell's neighbours in the i dimension. 
// Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); printf("h_west = %d, u + h_west = %p\n", h_west, u + h_west); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- uw(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, vn0.s0, vn1.s0, vn2.s0, vn3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, vn0.s1, vn1.s1, vn2.s1, vn3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, vn0.s2, vn1.s2, vn2.s2, vn3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vw(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, vn0.s3, vn1.s3, vn2.s3, vn3.s3); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. 
u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, u0.s0, u1.s0, u2.s0, u3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, u0.s1, u1.s1, u2.s1, u3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, u0.s2, u1.s2, u2.s2, u3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- u(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, u0.s3, u1.s3, u2.s3, u3.s3); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. 
vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. 
*/ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, v0.s0, v1.s0, v2.s0, v3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s1, jv.s1, kv.s1, b.s1, v0.s1, v1.s1, v2.s1, v3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, v0.s2, v1.s2, v2.s2, v3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- (%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, v0.s3, v1.s3, v2.s3, v3.s3); } else { vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, iv.s0, jv.s0, kv.s0, b.s0, v0.s0, v1.s0, v2.s0, v3.s0); printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, 
iv.s1, jv.s1, kv.s1, b.s1, v0.s1, v1.s1, v2.s1, v3.s1); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, iv.s2, jv.s2, kv.s2, b.s2, v0.s2, v1.s2, v2.s2, v3.s2); //printf("Wavefront %d: (%d, %d, %d) -- b = %d -- vf(%e, %e, %e, %e, %e)\n", wavefront, iv.s3, jv.s3, kv.s3, b.s3, v0.s3, v1.s3, v2.s3, v3.s3); } } } // Serial remainder. if (gid == 0) { for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } } kernels/vector/bak/blts.cl.vector0000644000175600017620000010021211544323734015532 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4))/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Only proceed to calculation if at least one element of the vector needs to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); if (any(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; // Very difficult to vectorise this, since we're going to have some branching... // Current implementation: Do all of the maths, but predicate the store. const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
// Forward elimination, pivot row 1 (ip = 1).
tmp1 = 1.0e+00 / tmat11;
tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp;
tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp;
tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp;
// ip = 2
tmp1 = 1.0e+00 / tmat22;
tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp;
tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp;
// ip = 3
tmp1 = 1.0e+00 / tmat33;
tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp;
/**
 * Back substitution.
 */
v4 /= tmat44;
v3 -= tmat43 * v4;
v3 /= tmat33;
// BUG FIX: "v2 -= tmat32 * v3 - tmat42 * v4" distributes the "-=" minus over
// the whole right-hand side and therefore ADDS the tmat42*v4 term. The
// triangular solve must subtract every upper-diagonal term, as in the
// reference NPB LU blts routine; the same fix applies to v1 and v0 below.
v2 = v2 - tmat32 * v3 - tmat42 * v4;
v2 /= tmat22;
v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4;
v1 /= tmat11;
v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4;
v0 /= tmat00;
// Store results: unconditional vector stores when every lane is active,
// otherwise a lane-predicated read-modify-write via select().
if (all(b)) {
    vstore(v0, 0, v + h_index + 0 * m_offset);
    vstore(v1, 0, v + h_index + 1 * m_offset);
    vstore(v2, 0, v + h_index + 2 * m_offset);
    vstore(v3, 0, v + h_index + 3 * m_offset);
    vstore(v4, 0, v + h_index + 4 * m_offset);
} else {
    vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1);
    vdouble old_v = vload(0, v + h_index + 0 * m_offset);
    v0 = select(old_v, v0, b2);
    vstore(v0, 0, v + h_index + 0 * m_offset);
    old_v = vload(0, v + h_index + 1 * m_offset);
    v1 = select(old_v, v1, b2);
    vstore(v1, 0, v + h_index + 1 * m_offset);
    old_v = vload(0, v + h_index + 2 * m_offset);
    v2 = select(old_v, v2, b2);
    vstore(v2, 0, v + h_index + 2 * m_offset);
    old_v = vload(0, v + h_index + 3 * m_offset);
    v3 = select(old_v, v3, b2);
    vstore(v3, 0, v + h_index + 3 * m_offset);
    old_v = vload(0, v + h_index + 4 * m_offset);
    v4 = select(old_v, v4, b2);
    vstore(v4, 0, v + h_index + 4 * m_offset);
    /*if (any(b.s0)) { v[h_index + 0 * m_offset] = v0.s0; v[h_index + 1 * m_offset] = v1.s0; v[h_index + 2 * m_offset] = 
v2.s0; v[h_index + 3 * m_offset] = v3.s0; v[h_index + 4 * m_offset] = v4.s0; } if (any(b.s1)) { v[h_index + 1 + 0 * m_offset] = v0.s1; v[h_index + 1 + 1 * m_offset] = v1.s1; v[h_index + 1 + 2 * m_offset] = v2.s1; v[h_index + 1 + 3 * m_offset] = v3.s1; v[h_index + 1 + 4 * m_offset] = v4.s1; }*/ } } } // Serial remainder. if (gid == 0) { for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
// Forward elimination, pivot row 1 (ip = 1) — serial remainder path.
tmp1 = 1.0e+00 / tmat11;
tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp;
tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp;
tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp;
// ip = 2
tmp1 = 1.0e+00 / tmat22;
tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp;
tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp;
// ip = 3
tmp1 = 1.0e+00 / tmat33;
tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp;
/**
 * Back substitution.
 */
v4 /= tmat44;
v[h_index + 4 * m_offset] = v4;
v3 -= tmat43 * v4;
v3 /= tmat33;
v[h_index + 3 * m_offset] = v3;
// BUG FIX: the previous "v2 -= tmat32 * v3 - tmat42 * v4" negated the second
// term (the "-=" minus distributes over the whole right-hand side). The
// triangular solve must subtract every upper term, matching the reference
// NPB LU blts routine; the same correction applies to v1 and v0 below.
v2 = v2 - tmat32 * v3 - tmat42 * v4;
v2 /= tmat22;
v[h_index + 2 * m_offset] = v2;
v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4;
v1 /= tmat11;
v[h_index + 1 * m_offset] = v1;
v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4;
v0 /= tmat00;
v[h_index + 0 * m_offset] = v0;
} } } }
kernels/vector/bak/blts.cl.scalar0000644000175600017620000004665111544123620015505 0ustar sjpsjp// Macro.
#define c1 c1_def
#define c2 c2_def
#define c3 c3_def
#define c4 c4_def
#define c5 c5_def
// Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. 
double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= 
omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). 
const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } kernels/vector/bak/blts.cl0000644000175600017620000007767511553015516014257 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. void blts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. 
double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= 
omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). 
const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; //if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); /*} else { vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); //vlong b2 = (vlong) ((long) iv >= (long) ist && (long) iv <= (long) iend && (long) jv >= (long) jst && (long) jv <= (long) jend && (long) kv >= 1 && (long) kv <= nz - 2 && (long) depthv >= (long) 0 && (long) depthv <= (long) kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * 
m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); }*/ // If there are some elements that don't require an update, iterate through the vector. } else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } } } // Serial remainder. for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } kernels/vector/.svn/entries0000444000175600017620000000341111753220711014462 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector svn://svn/perfmodelling 2011-05-30T16:07:57.886554Z 1351 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 post.cl file 2011-03-21T13:34:36.000000Z be6430c571bb7a19d566c8a77336aa70 2011-04-20T18:02:45.237789Z 1211 sjp 945 pre.cl file 2011-03-21T13:34:09.000000Z 763e063f17ef08657f21b022d1629534 2011-04-20T18:02:45.237789Z 1211 sjp 746 rearrangement.cl file 2011-03-21T12:59:36.000000Z 1614f6a2e60b2fb02efef23686c675bd 2011-04-20T18:02:45.237789Z 1211 sjp 8501 ex1_pack.cl file 2011-03-24T13:49:35.000000Z 5c470f1c79bc27ed17f20486c5e4f182 2011-03-31T11:08:35.335679Z 1179 sjp 4359 print.cl file 2011-03-09T15:40:13.000000Z b76116d749d465c134ebe9223072010a 2011-03-23T14:53:37.138628Z 1172 sjp 396 ex3_pack.cl file 2011-03-24T11:59:18.000000Z d9e92cd1106a7485e047a2bc45e0abb2 2011-03-31T11:08:35.335679Z 1179 sjp 4297 ex1_unpack.cl file 2011-03-24T13:48:38.000000Z be26ec8a2d3fd30366ac331bcd38ce5f 2011-03-31T11:08:35.335679Z 1179 sjp 4377 rhs dir ex3_unpack.cl file 2011-03-24T11:59:25.000000Z 74d5059ec9fbb485b7a354c93981d489 2011-03-31T11:08:35.335679Z 1179 sjp 4388 blts.cl file 2011-04-19T12:32:40.000000Z b8c6c92829a8c3829e860f1cdab01a21 2011-04-20T18:02:45.237789Z 1211 sjp 21679 l2norm.cl file 
2011-02-10T10:56:05.000000Z eba7c66f757b1a14473b5b1f06929cdb 2011-03-23T14:53:37.138628Z 1172 sjp 467 buts.cl file 2011-04-18T11:06:45.000000Z 7bb5957a79afd1cb9c6811a1142ce5f7 2011-04-20T18:02:45.237789Z 1211 sjp 33519 kernels/vector/rhs/rhs_zeta.cl0000644000175600017620000001602111553620343015143 0ustar sjpsjp/** * "Fused" version of rhs_zeta_{1,2,3,4,dissipation}. */ __kernel void rhs_zeta_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; int m; double q, u41; const double c1 = c1_def; const double c2 = c2_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_zeta1 for (k = 0 + kid; k <= nz - 1; k += ksize) { flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } // Rhs_zeta2 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - 
flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } // Rhs_zeta3 for (k = 1 + kid; k <= nz - 1; k += ksize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } // Rhs_zeta4 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * 
u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } // Rhs_zeta_dissipation for (k = 1 + kid; k <= nz - 2; k += ksize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * 
u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/rhs/rhs_xi.cl0000644000175600017620000001567311553047245014640 0ustar sjpsjp/** * "Fused" version of rhs_xi_{1,2,3,4,dissipation}. */ __kernel void rhs_xi_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ //double q, u21; //int L1, L2; int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; int m; double ist1, iend1; const double c1 = c1_def; //const double c2 = c2_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Set L1. /*if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; }*/ // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { // Rhs_xi1 /*for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; }*/ // Rhs_xi2 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } // Rhs_xi3 for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * 
( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } // Rhs_xi4 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } // Rhs_xi_dissipation for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector/rhs/rhs_setup.cl0000644000175600017620000000162111541651774015351 0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { for (i = 2 + iid; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/vector/rhs/rhs_eta.cl0000644000175600017620000001453311553620342014756 0ustar sjpsjp/** * "Fused" version of rhs_eta_{2,3,4,dissipation}. */ __kernel void rhs_eta_kernel ( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; int m; double jst1, jend1; const double c1 = c1_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_eta2 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } // Rhs_eta3 for (j = jst + jid; j <= L2; j += jsize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } // Rhs_eta4 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + 
dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } // Rhs_eta_dissipation for (j = jst + jid; j <= jend; j += jsize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. 
if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/scalar/rhs/rhs_zeta.cl0000644000175600017620000001602111553332316015106 0ustar sjpsjp/** * "Fused" version of rhs_zeta_{1,2,3,4,dissipation}. */ __kernel void rhs_zeta_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; int m; double q, u41; const double c1 = c1_def; const double c2 = c2_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_zeta1 for (k = 0 + kid; k <= nz - 1; k += ksize) { flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } // Rhs_zeta2 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } // Rhs_zeta3 for (k = 1 + kid; k <= nz - 1; k += ksize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * 
u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } // Rhs_zeta4 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * 
u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } // Rhs_zeta_dissipation for (k = 1 + kid; k <= nz - 2; k += ksize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/scalar/rhs/rhs_xi.cl0000644000175600017620000001374111553324253014572 0ustar sjpsjp/** * "Fused" version of rhs_xi_{2,3,4,dissipation}. */ __kernel void rhs_xi_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; int m; double ist1, iend1; const double c1 = c1_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Set L2. 
if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { // Rhs_xi2 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } // Rhs_xi3 for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } // 
Rhs_xi4 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } // Rhs_xi_dissipation for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. 
if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/scalar/rhs/rhs_setup.cl0000644000175600017620000000162111541651774015314 0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { for (i = 2 + iid; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/scalar/rhs/rhs_eta.cl0000644000175600017620000001453311553327333014725 0ustar sjpsjp/** * "Fused" version of rhs_eta_{2,3,4,dissipation}. */ __kernel void rhs_eta_kernel ( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. 
*/ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; int m; double jst1, jend1; const double c1 = c1_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_eta2 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } // Rhs_eta3 for (j = jst + jid; j <= L2; j += jsize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 
0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } // Rhs_eta4 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } // Rhs_eta_dissipation for (j = jst + jid; j <= jend; j += jsize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 
4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/scalar/.svn/entries0000444000175600017620000000341111753220711014425 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/scalar svn://svn/perfmodelling 2011-05-30T16:07:57.886554Z 1351 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 post.cl file 2011-05-16T10:54:56.411742Z be6430c571bb7a19d566c8a77336aa70 2011-03-23T14:53:37.138628Z 1172 sjp 945 pre.cl file 2011-05-16T10:54:59.256813Z 763e063f17ef08657f21b022d1629534 2011-03-23T14:53:37.138628Z 1172 sjp 746 rearrangement.cl file 2011-03-21T12:59:36.000000Z 1614f6a2e60b2fb02efef23686c675bd 2011-03-23T14:53:37.138628Z 1172 sjp 8501 ex1_pack.cl file 2011-03-24T13:49:35.000000Z 5c470f1c79bc27ed17f20486c5e4f182 2011-03-24T14:11:46.547033Z 1174 sjp 4359 print.cl file 2011-03-09T15:40:13.000000Z b76116d749d465c134ebe9223072010a 2011-03-23T14:53:37.138628Z 1172 sjp 396 ex3_pack.cl file 
2011-03-24T11:59:18.000000Z d9e92cd1106a7485e047a2bc45e0abb2 2011-03-24T12:02:37.075604Z 1173 sjp 4297 ex1_unpack.cl file 2011-03-24T13:48:38.000000Z be26ec8a2d3fd30366ac331bcd38ce5f 2011-03-24T14:11:46.547033Z 1174 sjp 4377 rhs dir ex3_unpack.cl file 2011-03-24T11:59:25.000000Z 74d5059ec9fbb485b7a354c93981d489 2011-03-24T12:02:37.075604Z 1173 sjp 4388 blts.cl file 2011-05-30T16:03:46.760793Z bdb1eaf468397efd56c68e8bcd2173e2 2011-03-31T11:08:35.335679Z 1179 sjp 20141 l2norm.cl file 2011-02-10T10:56:05.000000Z eba7c66f757b1a14473b5b1f06929cdb 2011-03-23T14:53:37.138628Z 1172 sjp 467 buts.cl file 2011-05-30T16:03:46.887793Z 5df3087bfa495f38020fba7ee40ed80d 2011-03-31T11:08:35.335679Z 1179 sjp 20895 kernels/vector2/.svn/text-base/soa.clh.svn-base0000444000175600017620000000507111545060262020041 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { #ifdef APPLU_BLOCKING_OLD int offset = 0; // Calculate thread id. offset += thread_mapping[(j * (isiz1 + 4)) + i]; // Jump enough blocks. int block_depth = k / kblock; int depth = k - (kblock * block_depth); offset += block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock); // Jump to the right wavefront. offset += wave_offset_3d[i + j + depth]; // Update thread_offset. if ( (i + j + depth) >= kblock - 1 ) { offset = offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4)); return offset; #else //#ifdef APPLU_BLOCKING_NEW int offset = 0; // Calculate thread id. 
offset += thread_mapping[(j * (isiz1 + 4)) + i]; // Jump to the right wavefront. offset += wave_offset_3d[i + j + k]; // Update thread_offset. if ( (i + j + k) >= isiz3 - 1 ) { offset = offset - wave_offset_2d[(i + j + k) - (isiz3 - 1)]; } // Add angle offset. offset += (m * problem_height * (isiz2 + 4) * (isiz1 + 4)); return offset; #endif } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { return m * (isiz1+4) * (isiz2+4) * isiz3 + (k * (isiz2 + 4) + j) * (isiz1 + 4) + i; /*int offset = 0; // Add block offset. const int block_i = (i / rhsblock_x); const int block_j = (j / rhsblock_y); const int block_id = (block_j * rhsgrid_x) + block_i; offset += block_id * (rhsblock_x * rhsblock_y * isiz3); // Add thread offset. const int thread_i = i - (block_i * rhsblock_x); const int thread_j = j - (block_j * rhsblock_y); offset += (thread_j * rhsblock_x) + thread_i; // Add k offset. offset += k * (rhsblock_x * rhsblock_y); // Add angle offset. offset += m * (rhsgrid_x * rhsgrid_y) * (rhsblock_x * rhsblock_y * isiz3); // Return. return offset;*/ } // Macro definitions for blts and buts. #define m_offset (problem_height * (isiz2 + 4) * (isiz1 + 4)) // Macro definitions for vector kernels. #define vint int2 #define vdouble double2 #define vlong long2 #define vlength 2 #define vload vload2 #define vstore vstore2 kernels/vector2/.svn/text-base/rearrangement.cl.svn-base0000444000175600017620000001761711545060262021752 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. 
*/ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int f_index = flat_index(k, j, i, 0); hyperplane_output[h_index + 0 * m_offset] = flat_input[f_index + 0]; hyperplane_output[h_index + 1 * m_offset] = flat_input[f_index + 1]; hyperplane_output[h_index + 2 * m_offset] = flat_input[f_index + 2]; hyperplane_output[h_index + 3 * m_offset] = flat_input[f_index + 3]; hyperplane_output[h_index + 4 * m_offset] = flat_input[f_index + 4]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int f_index = flat_index(k, j, i, 0); flat_output[f_index + 0] = hyperplane_input[h_index + 0 * m_offset]; flat_output[f_index + 1] = hyperplane_input[h_index + 1 * m_offset]; flat_output[f_index + 2] = hyperplane_input[h_index + 2 * m_offset]; flat_output[f_index + 3] = hyperplane_input[h_index + 3 * m_offset]; flat_output[f_index + 4] = hyperplane_input[h_index + 4 * m_offset]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int f_index = flat_index(k, j, i, 0); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); tiled_output[t_index + 0 * t_offset] = flat_input[f_index + 0]; tiled_output[t_index + 1 * t_offset] = flat_input[f_index + 1]; tiled_output[t_index + 2 * t_offset] = flat_input[f_index + 2]; tiled_output[t_index + 3 * t_offset] = flat_input[f_index + 3]; tiled_output[t_index + 4 * t_offset] = flat_input[f_index + 4]; } } } } /** * Shift from tiled to flat layout. 
*/ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int f_index = flat_index(k, j, i, 0); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flat_output[f_index + 0] = tiled_input[t_index + 0 * t_offset]; flat_output[f_index + 1] = tiled_input[t_index + 1 * t_offset]; flat_output[f_index + 2] = tiled_input[t_index + 2 * t_offset]; flat_output[f_index + 3] = tiled_input[t_index + 3 * t_offset]; flat_output[f_index + 4] = tiled_input[t_index + 4 * t_offset]; } } } } /** * Shift from tiled to hyperplane layout. */ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); hyperplane_output[h_index + 0 * m_offset] = tiled_input[t_index + 0 * t_offset]; hyperplane_output[h_index + 1 * m_offset] = tiled_input[t_index + 1 * t_offset]; hyperplane_output[h_index + 2 * m_offset] = tiled_input[t_index + 2 * t_offset]; hyperplane_output[h_index + 3 * m_offset] = tiled_input[t_index + 3 * t_offset]; hyperplane_output[h_index + 4 * m_offset] = tiled_input[t_index + 4 * t_offset]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); tiled_output[t_index + 0 * t_offset] = hyperplane_input[h_index + 0 * m_offset]; tiled_output[t_index + 1 * t_offset] = hyperplane_input[h_index + 1 * m_offset]; tiled_output[t_index + 2 * t_offset] = hyperplane_input[h_index + 2 * m_offset]; tiled_output[t_index + 3 * t_offset] = hyperplane_input[h_index + 3 * m_offset]; tiled_output[t_index + 4 * t_offset] = hyperplane_input[h_index + 4 * m_offset]; } } } } kernels/vector2/.svn/text-base/print.cl.svn-base0000444000175600017620000000061411542404560020241 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/vector2/.svn/text-base/pre.cl.svn-base0000444000175600017620000000323411545060262017674 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble dt_v = (vdouble) dt; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; vdouble res = vload(0, rsd + t_index + 0 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 0 * t_offset); res = vload(0, rsd + t_index + 1 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 1 * t_offset); res = vload(0, rsd + t_index + 2 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 2 * t_offset); res = vload(0, rsd + t_index + 3 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 3 * t_offset); res = vload(0, rsd + t_index + 4 * t_offset); res *= dt_v; vstore(res, 0, rsd + t_index + 4 * t_offset); } if (iid == 0) { for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } } kernels/vector2/.svn/text-base/post.cl.svn-base0000444000175600017620000000370311545060262020074 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble tmp_v = (vdouble) tmp; int index; index = tiled_index(k, j, i, 0); vdouble u_v = vload(0, u + index); vdouble r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 1); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 2); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 3); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); index = tiled_index(k, j, i, 4); u_v = vload(0, u + index); r_v = vload(0, rsd + index); u_v += tmp_v * r_v; vstore(u_v, 0, u + index); } if (iid == 0) { for (; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } } kernels/vector2/.svn/text-base/nvidia.clh.svn-base0000444000175600017620000000034011542404556020530 0ustar sjpsjp// Pragma required to enable double precision. 
#pragma OPENCL EXTENSION cl_khr_fp64 : enable #define c1_def (1.40e+00) #define c2_def (0.40e+00) #define c3_def (1.00e-01) #define c4_def (1.00e+00) #define c5_def (1.40e+00) kernels/vector2/.svn/text-base/l2norm.cl.svn-base0000444000175600017620000000072311542404556020324 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/vector2/.svn/text-base/ex3_unpack.cl.svn-base0000444000175600017620000001044411545060262021147 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/vector2/.svn/text-base/ex3_pack.cl.svn-base0000444000175600017620000001031111545060262020575 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/vector2/.svn/text-base/ex1_unpack.cl.svn-base0000444000175600017620000001043111545060262021141 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/vector2/.svn/text-base/ex1_pack.cl.svn-base0000444000175600017620000001040711545060262020601 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/vector2/.svn/text-base/buts.cl.svn-base0000444000175600017620000010140611545060261020062 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. void buts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. 
int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). 
// --- buts_serial, i-direction contribution (neighbour at k, j, i+1) ---
int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Read in v neighbour.
// NOTE(review): vn0..vn4 are re-loaded again before each use below,
// so this batch appears redundant; kept as-is.
vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset];
// Read in u neighbour, for calculation of a (used by the
// jacu_a_value_* macros defined earlier in the file).
u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset];
// Compute some values based on u0.
tmp1 = 1.0e+00 / u0;
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
// Accumulate v += omega * A * vn, column by column.
vn0 = v[h_east + 0 * m_offset];
v0 = v0 + omega * ( jacu_a_value_00 * vn0 );
v1 = v1 + omega * ( jacu_a_value_01 * vn0 );
v2 = v2 + omega * ( jacu_a_value_02 * vn0 );
v3 = v3 + omega * ( jacu_a_value_03 * vn0 );
v4 = v4 + omega * ( jacu_a_value_04 * vn0 );
vn1 = v[h_east + 1 * m_offset];
v0 = v0 + omega * ( jacu_a_value_10 * vn1 );
v1 = v1 + omega * ( jacu_a_value_11 * vn1 );
v2 = v2 + omega * ( jacu_a_value_12 * vn1 );
v3 = v3 + omega * ( jacu_a_value_13 * vn1 );
v4 = v4 + omega * ( jacu_a_value_14 * vn1 );
vn2 = v[h_east + 2 * m_offset];
v0 = v0 + omega * ( jacu_a_value_20 * vn2 );
v1 = v1 + omega * ( jacu_a_value_21 * vn2 );
v2 = v2 + omega * ( jacu_a_value_22 * vn2 );
v3 = v3 + omega * ( jacu_a_value_23 * vn2 );
v4 = v4 + omega * ( jacu_a_value_24 * vn2 );
vn3 = v[h_east + 3 * m_offset];
v0 = v0 + omega * ( jacu_a_value_30 * vn3 );
v1 = v1 + omega * ( jacu_a_value_31 * vn3 );
v2 = v2 + omega * ( jacu_a_value_32 * vn3 );
v3 = v3 + omega * ( jacu_a_value_33 * vn3 );
v4 = v4 + omega * ( jacu_a_value_34 * vn3 );
vn4 = v[h_east + 4 * m_offset];
v0 = v0 + omega * ( jacu_a_value_40 * vn4 );
v1 = v1 + omega * ( jacu_a_value_41 * vn4 );
v2 = v2 + omega * ( jacu_a_value_42 * vn4 );
v3 = v3 + omega * ( jacu_a_value_43 * vn4 );
v4 = v4 + omega * ( jacu_a_value_44 * vn4 );
/**
 * Diagonal block inversion.
 */
// Read in u values.
// --- buts_serial, diagonal block setup + Gaussian elimination, pivot 0 ---
// Read in u values at the cell itself; the jacu_d_value_* macros build
// the 5x5 diagonal Jacobian block D from them.
u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset];
// Compute some values based on u0.
tmp1 = 1.0e+00 / u0;
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
// Materialise D into scalar registers tmat<row><col>, column-major
// declaration order (column 0 first). These are eliminated in place
// below, so the exact statement order matters.
double tmat00 = jacu_d_value_00;
double tmat10 = jacu_d_value_10;
double tmat20 = jacu_d_value_20;
double tmat30 = jacu_d_value_30;
double tmat40 = jacu_d_value_40;
double tmat01 = jacu_d_value_01;
double tmat11 = jacu_d_value_11;
double tmat21 = jacu_d_value_21;
double tmat31 = jacu_d_value_31;
double tmat41 = jacu_d_value_41;
double tmat02 = jacu_d_value_02;
double tmat12 = jacu_d_value_12;
double tmat22 = jacu_d_value_22;
double tmat32 = jacu_d_value_32;
double tmat42 = jacu_d_value_42;
double tmat03 = jacu_d_value_03;
double tmat13 = jacu_d_value_13;
double tmat23 = jacu_d_value_23;
double tmat33 = jacu_d_value_33;
double tmat43 = jacu_d_value_43;
double tmat04 = jacu_d_value_04;
double tmat14 = jacu_d_value_14;
double tmat24 = jacu_d_value_24;
double tmat34 = jacu_d_value_34;
double tmat44 = jacu_d_value_44;
// ip = 0: eliminate the first unknown from rows 1..4 (no pivoting),
// applying the same row operations to the right-hand side v0..v4.
double tmp;
tmp1 = 1.0e+00 / tmat00;
tmp = tmp1 * tmat01;
tmat11 = tmat11 - tmp * tmat10;
tmat21 = tmat21 - tmp * tmat20;
tmat31 = tmat31 - tmp * tmat30;
tmat41 = tmat41 - tmp * tmat40;
v1 = v1 - v0 * tmp;
tmp = tmp1 * tmat02;
tmat12 = tmat12 - tmp * tmat10;
tmat22 = tmat22 - tmp * tmat20;
tmat32 = tmat32 - tmp * tmat30;
tmat42 = tmat42 - tmp * tmat40;
v2 = v2 - v0 * tmp;
tmp = tmp1 * tmat03;
tmat13 = tmat13 - tmp * tmat10;
tmat23 = tmat23 - tmp * tmat20;
tmat33 = tmat33 - tmp * tmat30;
tmat43 = tmat43 - tmp * tmat40;
v3 = v3 - v0 * tmp;
tmp = tmp1 * tmat04;
tmat14 = tmat14 - tmp * tmat10;
tmat24 = tmat24 - tmp * tmat20;
tmat34 = tmat34 - tmp * tmat30;
tmat44 = tmat44 - tmp * tmat40;
v4 = v4 - v0 * tmp;
// ip = 1.
// --- buts_serial, elimination pivots 1-3, back substitution, write-back ---
// ip = 1: eliminate the second unknown from rows 2..4.
tmp1 = 1.0e+00 / tmat11;
tmp = tmp1 * tmat12;
tmat22 = tmat22 - tmp * tmat21;
tmat32 = tmat32 - tmp * tmat31;
tmat42 = tmat42 - tmp * tmat41;
v2 = v2 - v1 * tmp;
tmp = tmp1 * tmat13;
tmat23 = tmat23 - tmp * tmat21;
tmat33 = tmat33 - tmp * tmat31;
tmat43 = tmat43 - tmp * tmat41;
v3 = v3 - v1 * tmp;
tmp = tmp1 * tmat14;
tmat24 = tmat24 - tmp * tmat21;
tmat34 = tmat34 - tmp * tmat31;
tmat44 = tmat44 - tmp * tmat41;
v4 = v4 - v1 * tmp;
// ip = 2
tmp1 = 1.0e+00 / tmat22;
tmp = tmp1 * tmat23;
tmat33 = tmat33 - tmp * tmat32;
tmat43 = tmat43 - tmp * tmat42;
v3 = v3 - v2 * tmp;
tmp = tmp1 * tmat24;
tmat34 = tmat34 - tmp * tmat32;
tmat44 = tmat44 - tmp * tmat42;
v4 = v4 - v2 * tmp;
// ip = 3
tmp1 = 1.0e+00 / tmat33;
tmp = tmp1 * tmat34;
tmat44 = tmat44 - tmp * tmat43;
v4 = v4 - v3 * tmp;
/**
 * Back substitution.
 * Solve the now upper-triangular 5x5 system from the last row up.
 */
v4 = v4 / tmat44;
v3 = v3 - tmat43 * v4;
v3 = v3 / tmat33;
v2 = v2 - tmat32 * v3 - tmat42 * v4;
v2 = v2 / tmat22;
v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4;
v1 = v1 / tmat11;
v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4;
v0 = v0 / tmat00;
// Update the values of v: subtract the solved correction from the
// stored solution components of this cell.
v[h_index + 0 * m_offset] -= v0;
v[h_index + 1 * m_offset] -= v1;
v[h_index + 2 * m_offset] -= v2;
v[h_index + 3 * m_offset] -= v3;
v[h_index + 4 * m_offset] -= v4;
}
}
// OpenCL kernel for buts step (vectorised upper-triangular SSOR sweep;
// falls back to buts_serial for partially in-bounds vectors).
__kernel void buts_kernel(
    __global double* v,
    __global const double* u,
    __global const int* wavefront_offsets_2d,
    __global const int* wavefront_offsets_3d,
    __global const int* columns,
    __global const int* rows,
    __global const int* thread_mapping,
    __const int wavefront,
    __const int starting_k) {
// Get thread id.
// --- buts_kernel prologue: strided loop over vector-sized cell groups ---
const int gid = get_global_id(0);
const int threads = get_global_size(0);
int cell;
// Largest multiple of vlength not exceeding the cell count, so the main
// loop only handles full vectors; the remainder is done serially later.
const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength;
for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) {
// Per-lane (i, j) coordinates and derived wavefront k / depth values.
const vint iv = vload(0, columns + cell);
const vint jv = vload(0, rows + cell);
const vint kv = (starting_k - (kblock - 1)) + (wavefront - (iv + jv));
const vint depthv = (wavefront - (iv + jv));
// Look at which elements of the vector need to be updated.
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1);
// If they ALL need to be updated, do a "proper" vector op.
if (all(b)) {
// Scalar coordinates of the first lane; the lanes are assumed
// contiguous in the hyperplane layout so one index suffices.
const int i = columns[cell];
const int j = rows[cell];
const int k = (starting_k - (kblock -1)) + (wavefront - (i + j));
const double r43 = ( 4.0e+00 / 3.0e+00 );
const double c1345 = c1_def * c3_def * c4_def * c5_def;
const double c34 = c3_def * c4_def;
int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Update the values of v based on the cell's neighbour in the k direction.
// Calculate the index for (k+1, j, i).
int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Read in v neighbour.
// NOTE(review): vn0..vn4 are re-loaded before each use below, mirroring
// the redundant loads in the scalar path; kept as-is.
vdouble vn0 = vload(0, v + h_below + 0 * m_offset);
vdouble vn1 = vload(0, v + h_below + 1 * m_offset);
vdouble vn2 = vload(0, v + h_below + 2 * m_offset);
vdouble vn3 = vload(0, v + h_below + 3 * m_offset);
vdouble vn4 = vload(0, v + h_below + 4 * m_offset);
// Read in u neighbour, for calculation of c.
vdouble u0 = vload(0, u + h_below + 0 * m_offset);
vdouble u1 = vload(0, u + h_below + 1 * m_offset);
vdouble u2 = vload(0, u + h_below + 2 * m_offset);
vdouble u3 = vload(0, u + h_below + 3 * m_offset);
vdouble u4 = vload(0, u + h_below + 4 * m_offset);
// Compute some values based on u0.
// --- buts_kernel, vectorised k-direction (jacu_c) contribution ---
// Reciprocal density powers, consumed by the jacu_c_value_* macros.
vdouble tmp1 = 1.0e+00 / u0;
vdouble tmp2 = tmp1 * tmp1;
vdouble tmp3 = tmp1 * tmp2;
// Accumulate v = omega * C * vn, column by column (vector lanes).
vn0 = vload(0, v + h_below + 0 * m_offset);
vdouble v0 = omega * ( jacu_c_value_00 * vn0 );
vdouble v1 = omega * ( jacu_c_value_01 * vn0 );
vdouble v2 = omega * ( jacu_c_value_02 * vn0 );
vdouble v3 = omega * ( jacu_c_value_03 * vn0 );
vdouble v4 = omega * ( jacu_c_value_04 * vn0 );
vn1 = vload(0, v + h_below + 1 * m_offset);
v0 = v0 + omega * ( jacu_c_value_10 * vn1 );
v1 = v1 + omega * ( jacu_c_value_11 * vn1 );
v2 = v2 + omega * ( jacu_c_value_12 * vn1 );
v3 = v3 + omega * ( jacu_c_value_13 * vn1 );
v4 = v4 + omega * ( jacu_c_value_14 * vn1 );
vn2 = vload(0, v + h_below + 2 * m_offset);
v0 = v0 + omega * ( jacu_c_value_20 * vn2 );
v1 = v1 + omega * ( jacu_c_value_21 * vn2 );
v2 = v2 + omega * ( jacu_c_value_22 * vn2 );
v3 = v3 + omega * ( jacu_c_value_23 * vn2 );
v4 = v4 + omega * ( jacu_c_value_24 * vn2 );
vn3 = vload(0, v + h_below + 3 * m_offset);
v0 = v0 + omega * ( jacu_c_value_30 * vn3 );
v1 = v1 + omega * ( jacu_c_value_31 * vn3 );
v2 = v2 + omega * ( jacu_c_value_32 * vn3 );
v3 = v3 + omega * ( jacu_c_value_33 * vn3 );
v4 = v4 + omega * ( jacu_c_value_34 * vn3 );
vn4 = vload(0, v + h_below + 4 * m_offset);
v0 = v0 + omega * ( jacu_c_value_40 * vn4 );
v1 = v1 + omega * ( jacu_c_value_41 * vn4 );
v2 = v2 + omega * ( jacu_c_value_42 * vn4 );
v3 = v3 + omega * ( jacu_c_value_43 * vn4 );
v4 = v4 + omega * ( jacu_c_value_44 * vn4 );
// Update the values of v based on its neighbours in the j direction.
int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Read in v neighbour (re-loaded again before use below; kept as-is).
vn0 = vload(0, v + h_south + 0 * m_offset);
vn1 = vload(0, v + h_south + 1 * m_offset);
vn2 = vload(0, v + h_south + 2 * m_offset);
vn3 = vload(0, v + h_south + 3 * m_offset);
vn4 = vload(0, v + h_south + 4 * m_offset);
// Read in u neighbour, for calculation of b.
// --- buts_kernel, vectorised j-direction (jacu_b) contribution ---
u0 = vload(0, u + h_south + 0 * m_offset);
u1 = vload(0, u + h_south + 1 * m_offset);
u2 = vload(0, u + h_south + 2 * m_offset);
u3 = vload(0, u + h_south + 3 * m_offset);
u4 = vload(0, u + h_south + 4 * m_offset);
// Compute some values based on u0.
tmp1 = 1.0e+00 / u0;
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
// Accumulate v += omega * B * vn, column by column.
vn0 = vload(0, v + h_south + 0 * m_offset);
v0 = v0 + omega * ( jacu_b_value_00 * vn0 );
v1 = v1 + omega * ( jacu_b_value_01 * vn0 );
v2 = v2 + omega * ( jacu_b_value_02 * vn0 );
v3 = v3 + omega * ( jacu_b_value_03 * vn0 );
v4 = v4 + omega * ( jacu_b_value_04 * vn0 );
vn1 = vload(0, v + h_south + 1 * m_offset);
v0 = v0 + omega * ( jacu_b_value_10 * vn1 );
v1 = v1 + omega * ( jacu_b_value_11 * vn1 );
v2 = v2 + omega * ( jacu_b_value_12 * vn1 );
v3 = v3 + omega * ( jacu_b_value_13 * vn1 );
v4 = v4 + omega * ( jacu_b_value_14 * vn1 );
vn2 = vload(0, v + h_south + 2 * m_offset);
v0 = v0 + omega * ( jacu_b_value_20 * vn2 );
v1 = v1 + omega * ( jacu_b_value_21 * vn2 );
v2 = v2 + omega * ( jacu_b_value_22 * vn2 );
v3 = v3 + omega * ( jacu_b_value_23 * vn2 );
v4 = v4 + omega * ( jacu_b_value_24 * vn2 );
vn3 = vload(0, v + h_south + 3 * m_offset);
v0 = v0 + omega * ( jacu_b_value_30 * vn3 );
v1 = v1 + omega * ( jacu_b_value_31 * vn3 );
v2 = v2 + omega * ( jacu_b_value_32 * vn3 );
v3 = v3 + omega * ( jacu_b_value_33 * vn3 );
v4 = v4 + omega * ( jacu_b_value_34 * vn3 );
vn4 = vload(0, v + h_south + 4 * m_offset);
v0 = v0 + omega * ( jacu_b_value_40 * vn4 );
v1 = v1 + omega * ( jacu_b_value_41 * vn4 );
v2 = v2 + omega * ( jacu_b_value_42 * vn4 );
v3 = v3 + omega * ( jacu_b_value_43 * vn4 );
v4 = v4 + omega * ( jacu_b_value_44 * vn4 );
// Update the values of v based on its neighbours in the i direction.
// Calculate the index of (k, j, i+1).
int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
// Read in v neighbour.
// --- buts_kernel, vectorised i-direction (jacu_a) contribution ---
// (vn0..vn4 re-loaded again before use below; kept as-is.)
vn0 = vload(0, v + h_east + 0 * m_offset);
vn1 = vload(0, v + h_east + 1 * m_offset);
vn2 = vload(0, v + h_east + 2 * m_offset);
vn3 = vload(0, v + h_east + 3 * m_offset);
vn4 = vload(0, v + h_east + 4 * m_offset);
// Read in u neighbour, for calculation of a.
u0 = vload(0, u + h_east + 0 * m_offset);
u1 = vload(0, u + h_east + 1 * m_offset);
u2 = vload(0, u + h_east + 2 * m_offset);
u3 = vload(0, u + h_east + 3 * m_offset);
u4 = vload(0, u + h_east + 4 * m_offset);
// Compute some values based on u0.
tmp1 = 1.0e+00 / u0;
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
// Accumulate v += omega * A * vn, column by column.
vn0 = vload(0, v + h_east + 0 * m_offset);
v0 = v0 + omega * ( jacu_a_value_00 * vn0 );
v1 = v1 + omega * ( jacu_a_value_01 * vn0 );
v2 = v2 + omega * ( jacu_a_value_02 * vn0 );
v3 = v3 + omega * ( jacu_a_value_03 * vn0 );
v4 = v4 + omega * ( jacu_a_value_04 * vn0 );
vn1 = vload(0, v + h_east + 1 * m_offset);
v0 = v0 + omega * ( jacu_a_value_10 * vn1 );
v1 = v1 + omega * ( jacu_a_value_11 * vn1 );
v2 = v2 + omega * ( jacu_a_value_12 * vn1 );
v3 = v3 + omega * ( jacu_a_value_13 * vn1 );
v4 = v4 + omega * ( jacu_a_value_14 * vn1 );
vn2 = vload(0, v + h_east + 2 * m_offset);
v0 = v0 + omega * ( jacu_a_value_20 * vn2 );
v1 = v1 + omega * ( jacu_a_value_21 * vn2 );
v2 = v2 + omega * ( jacu_a_value_22 * vn2 );
v3 = v3 + omega * ( jacu_a_value_23 * vn2 );
v4 = v4 + omega * ( jacu_a_value_24 * vn2 );
vn3 = vload(0, v + h_east + 3 * m_offset);
v0 = v0 + omega * ( jacu_a_value_30 * vn3 );
v1 = v1 + omega * ( jacu_a_value_31 * vn3 );
v2 = v2 + omega * ( jacu_a_value_32 * vn3 );
v3 = v3 + omega * ( jacu_a_value_33 * vn3 );
v4 = v4 + omega * ( jacu_a_value_34 * vn3 );
vn4 = vload(0, v + h_east + 4 * m_offset);
v0 = v0 + omega * ( jacu_a_value_40 * vn4 );
v1 = v1 + omega * ( jacu_a_value_41 * vn4 );
v2 = v2 + omega * ( jacu_a_value_42 * vn4 );
v3 = v3 + omega * ( jacu_a_value_43 * vn4 );
v4 = v4 + omega * ( jacu_a_value_44 * vn4 );
/**
 * Diagonal block inversion.
 */
// Read in u values.
// --- buts_kernel, vectorised diagonal block setup + elimination pivot 0 ---
u0 = vload(0, u + h_index + 0 * m_offset);
u1 = vload(0, u + h_index + 1 * m_offset);
u2 = vload(0, u + h_index + 2 * m_offset);
u3 = vload(0, u + h_index + 3 * m_offset);
u4 = vload(0, u + h_index + 4 * m_offset);
// Compute some values based on u0.
tmp1 = 1.0e+00 / u0;
tmp2 = tmp1 * tmp1;
tmp3 = tmp1 * tmp2;
// Materialise the 5x5 diagonal Jacobian D into vector registers
// tmat<row><col>; eliminated in place below (statement order matters).
vdouble tmat00 = jacu_d_value_00;
vdouble tmat10 = jacu_d_value_10;
vdouble tmat20 = jacu_d_value_20;
vdouble tmat30 = jacu_d_value_30;
vdouble tmat40 = jacu_d_value_40;
vdouble tmat01 = jacu_d_value_01;
vdouble tmat11 = jacu_d_value_11;
vdouble tmat21 = jacu_d_value_21;
vdouble tmat31 = jacu_d_value_31;
vdouble tmat41 = jacu_d_value_41;
vdouble tmat02 = jacu_d_value_02;
vdouble tmat12 = jacu_d_value_12;
vdouble tmat22 = jacu_d_value_22;
vdouble tmat32 = jacu_d_value_32;
vdouble tmat42 = jacu_d_value_42;
vdouble tmat03 = jacu_d_value_03;
vdouble tmat13 = jacu_d_value_13;
vdouble tmat23 = jacu_d_value_23;
vdouble tmat33 = jacu_d_value_33;
vdouble tmat43 = jacu_d_value_43;
vdouble tmat04 = jacu_d_value_04;
vdouble tmat14 = jacu_d_value_14;
vdouble tmat24 = jacu_d_value_24;
vdouble tmat34 = jacu_d_value_34;
vdouble tmat44 = jacu_d_value_44;
// ip = 0: eliminate the first unknown from rows 1..4 (no pivoting),
// applying the same row operations to the right-hand side v0..v4.
vdouble tmp;
tmp1 = 1.0e+00 / tmat00;
tmp = tmp1 * tmat01;
tmat11 = tmat11 - tmp * tmat10;
tmat21 = tmat21 - tmp * tmat20;
tmat31 = tmat31 - tmp * tmat30;
tmat41 = tmat41 - tmp * tmat40;
v1 = v1 - v0 * tmp;
tmp = tmp1 * tmat02;
tmat12 = tmat12 - tmp * tmat10;
tmat22 = tmat22 - tmp * tmat20;
tmat32 = tmat32 - tmp * tmat30;
tmat42 = tmat42 - tmp * tmat40;
v2 = v2 - v0 * tmp;
tmp = tmp1 * tmat03;
tmat13 = tmat13 - tmp * tmat10;
tmat23 = tmat23 - tmp * tmat20;
tmat33 = tmat33 - tmp * tmat30;
tmat43 = tmat43 - tmp * tmat40;
v3 = v3 - v0 * tmp;
tmp = tmp1 * tmat04;
tmat14 = tmat14 - tmp * tmat10;
tmat24 = tmat24 - tmp * tmat20;
tmat34 = tmat34 - tmp * tmat30;
tmat44 = tmat44 - tmp * tmat40;
v4 = v4 - v0 * tmp;
// ip = 1.
// --- buts_kernel, elimination pivots 1-3, back substitution, store ---
// ip = 1: eliminate the second unknown from rows 2..4.
tmp1 = 1.0e+00 / tmat11;
tmp = tmp1 * tmat12;
tmat22 = tmat22 - tmp * tmat21;
tmat32 = tmat32 - tmp * tmat31;
tmat42 = tmat42 - tmp * tmat41;
v2 = v2 - v1 * tmp;
tmp = tmp1 * tmat13;
tmat23 = tmat23 - tmp * tmat21;
tmat33 = tmat33 - tmp * tmat31;
tmat43 = tmat43 - tmp * tmat41;
v3 = v3 - v1 * tmp;
tmp = tmp1 * tmat14;
tmat24 = tmat24 - tmp * tmat21;
tmat34 = tmat34 - tmp * tmat31;
tmat44 = tmat44 - tmp * tmat41;
v4 = v4 - v1 * tmp;
// ip = 2
tmp1 = 1.0e+00 / tmat22;
tmp = tmp1 * tmat23;
tmat33 = tmat33 - tmp * tmat32;
tmat43 = tmat43 - tmp * tmat42;
v3 = v3 - v2 * tmp;
tmp = tmp1 * tmat24;
tmat34 = tmat34 - tmp * tmat32;
tmat44 = tmat44 - tmp * tmat42;
v4 = v4 - v2 * tmp;
// ip = 3
tmp1 = 1.0e+00 / tmat33;
tmp = tmp1 * tmat34;
tmat44 = tmat44 - tmp * tmat43;
v4 = v4 - v3 * tmp;
/**
 * Back substitution.
 * Solve the now upper-triangular 5x5 system from the last row up.
 */
v4 = v4 / tmat44;
v3 = v3 - tmat43 * v4;
v3 = v3 / tmat33;
v2 = v2 - tmat32 * v3 - tmat42 * v4;
v2 = v2 / tmat22;
v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4;
v1 = v1 / tmat11;
v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4;
v0 = v0 / tmat00;
// Update v: vector read-modify-write, subtracting the solved
// correction component-wise.
vdouble old_v;
old_v = vload(0, v + h_index + 0 * m_offset);
old_v -= v0;
vstore(old_v, 0, v + h_index + 0 * m_offset);
old_v = vload(0, v + h_index + 1 * m_offset);
old_v -= v1;
vstore(old_v, 0, v + h_index + 1 * m_offset);
old_v = vload(0, v + h_index + 2 * m_offset);
old_v -= v2;
vstore(old_v, 0, v + h_index + 2 * m_offset);
old_v = vload(0, v + h_index + 3 * m_offset);
old_v -= v3;
vstore(old_v, 0, v + h_index + 3 * m_offset);
old_v = vload(0, v + h_index + 4 * m_offset);
old_v -= v4;
vstore(old_v, 0, v + h_index + 4 * m_offset);
} else if (any(b)) {
// Mixed vector: at least one lane is in bounds, but not all — fall
// back to the scalar path lane by lane (buts_serial re-checks bounds).
int vcell;
for (vcell = 0; vcell < vlength; vcell++) {
buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell);
}
}
}
// Serial tidy-up.
if (gid == 0) { for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } } kernels/vector2/.svn/text-base/blts.cl.svn-base0000444000175600017620000007772411545060262020071 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. #define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 
) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * 
u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - 
(r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * 
c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. 
void blts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = starting_k + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. 
vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = starting_k + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. vdouble v0 = vload(0, v + h_index + 0 * m_offset); vdouble v1 = vload(0, v + h_index + 1 * m_offset); vdouble v2 = vload(0, v + h_index + 2 * m_offset); vdouble v3 = vload(0, v + h_index + 3 * m_offset); vdouble v4 = vload(0, v + h_index + 4 * m_offset); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. vdouble u0 = vload(0, u + h_above + 0 * m_offset); vdouble u1 = vload(0, u + h_above + 1 * m_offset); vdouble u2 = vload(0, u + h_above + 2 * m_offset); vdouble u3 = vload(0, u + h_above + 3 * m_offset); vdouble u4 = vload(0, u + h_above + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vdouble vn0 = vload(0, v + h_above + 0 * m_offset); v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); vdouble vn1 = vload(0, v + h_above + 1 * m_offset); v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); vdouble vn2 = vload(0, v + h_above + 2 * m_offset); v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); vdouble vn3 = vload(0, v + h_above + 3 * m_offset); v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); vdouble vn4 = vload(0, v + h_above + 4 * m_offset); v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = vload(0, u + h_north + 0 * m_offset); u1 = vload(0, u + h_north + 1 * m_offset); u2 = vload(0, u + h_north + 2 * m_offset); u3 = vload(0, u + h_north + 3 * m_offset); u4 = vload(0, u + h_north + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_north + 0 * m_offset); v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = vload(0, v + h_north + 1 * m_offset); v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = vload(0, v + h_north + 2 * m_offset); v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = vload(0, v + h_north + 3 * m_offset); v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = vload(0, v + h_north + 4 * m_offset); v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = vload(0, u + h_west + 0 * m_offset); u1 = vload(0, u + h_west + 1 * m_offset); u2 = vload(0, u + h_west + 2 * m_offset); u3 = vload(0, u + h_west + 3 * m_offset); u4 = vload(0, u + h_west + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_west + 0 * m_offset); v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = vload(0, v + h_west + 1 * m_offset); v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = vload(0, v + h_west + 2 * m_offset); v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = vload(0, v + h_west + 3 * m_offset); v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = vload(0, v + h_west + 4 * m_offset); v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacld_d_value_00; vdouble tmat10 = jacld_d_value_10; vdouble tmat20 = jacld_d_value_20; vdouble tmat30 = jacld_d_value_30; vdouble tmat40 = jacld_d_value_40; vdouble tmat01 = jacld_d_value_01; vdouble tmat11 = jacld_d_value_11; vdouble tmat21 = jacld_d_value_21; vdouble tmat31 = jacld_d_value_31; vdouble tmat41 = jacld_d_value_41; vdouble tmat02 = jacld_d_value_02; vdouble tmat12 = jacld_d_value_12; vdouble tmat22 = jacld_d_value_22; vdouble tmat32 = jacld_d_value_32; vdouble tmat42 = jacld_d_value_42; vdouble tmat03 = jacld_d_value_03; vdouble tmat13 = jacld_d_value_13; vdouble tmat23 = jacld_d_value_23; vdouble tmat33 = jacld_d_value_33; vdouble tmat43 = jacld_d_value_43; vdouble tmat04 = jacld_d_value_04; vdouble tmat14 = jacld_d_value_14; vdouble tmat24 = jacld_d_value_24; vdouble tmat34 = jacld_d_value_34; vdouble tmat44 = jacld_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v3 -= tmat43 * v4; v3 /= tmat33; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; //if (all(b)) { vstore(v0, 0, v + h_index + 0 * m_offset); vstore(v1, 0, v + h_index + 1 * m_offset); vstore(v2, 0, v + h_index + 2 * m_offset); vstore(v3, 0, v + h_index + 3 * m_offset); vstore(v4, 0, v + h_index + 4 * m_offset); /*} else { vlong b2 = (vlong) (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); //vlong b2 = (vlong) ((long) iv >= (long) ist && (long) iv <= (long) iend && (long) jv >= (long) jst && (long) jv <= (long) jend && (long) kv >= 1 && (long) kv <= nz - 2 && (long) depthv >= (long) 0 && (long) depthv <= (long) kblock - 1); vdouble old_v = vload(0, v + h_index + 0 * m_offset); v0 = select(old_v, v0, b2); vstore(v0, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); v1 = select(old_v, v1, b2); vstore(v1, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); v2 = select(old_v, v2, b2); vstore(v2, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); v3 = select(old_v, v3, b2); vstore(v3, 0, v + h_index + 3 * 
m_offset); old_v = vload(0, v + h_index + 4 * m_offset); v4 = select(old_v, v4, b2); vstore(v4, 0, v + h_index + 4 * m_offset); }*/ // If there are some elements that don't require an update, iterate through the vector. } else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } } } // Serial remainder. if (gid == 0) { for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { blts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } } kernels/vector2/.svn/text-base/aos.clh.svn-base0000444000175600017620000000310511545060262020035 0ustar sjpsjp// Device function for the calculation of flat indices. inline int flat_index(const int k, const int j, const int i, const int m) { return ((k * (isiz2 + 4) + j) * (isiz1 + 4) + i) * 5 + m; } /** * Device function to calculate hyperplane index. * Note: Access to thread_mapping is uncoalesced! */ inline int hyperplane_index(const int k, const int j, const int i, const int m, __global const int* wave_offset_2d, __global const int* wave_offset_3d, __global const int* thread_mapping) { #ifdef APPLU_BLOCKING_OLD // Calculate block offset. int block_depth = k / kblock; int depth = k - (kblock * block_depth); int block_offset = block_depth * ((isiz1 + 4) * (isiz2 + 4) * kblock * 5); // Calculate thread offset. int thread_offset = thread_mapping[(j * (isiz1 + 4)) + i]; thread_offset += wave_offset_3d[i + j + depth]; if ( (i + j + depth) >= kblock - 1 ) { thread_offset = thread_offset - wave_offset_2d[(i + j + depth) - (kblock - 1)]; } // Add angle offset. return block_offset + (5 * thread_offset) + m; #else //#ifdef APPLU_BLOCKING_NEW // Calculate thread offset. 
int thread_offset = thread_mapping[(j * (isiz1 + 4)) + i]; thread_offset += wave_offset_3d[i + j + k]; if ( (i + j + k) >= isiz3 - 1 ) { thread_offset = thread_offset - wave_offset_2d[(i + j + k) - (isiz3 - 1)]; } // Add angle offset. return (5 * thread_offset) + m; #endif } /** * Calculate the tiled index for ursd. */ inline int tiled_index(const int k, const int j, const int i, const int m) { return flat_index(k, j, i, m); } // Macro definitions for blts and buts. #define m_offset (1) kernels/vector2/.svn/text-base/amd.clh.svn-base0000444000175600017620000000045111542404556020022 0ustar sjpsjp// Pragma required to enable double precision. #pragma OPENCL EXTENSION cl_amd_fp64 : enable #pragma OPENCL EXTENSION cl_amd_printf : enable #define PRINTF_SUPPORTED #define c1_def (1.40e+00) #define c2_def (0.40e+00) #define c3_def (1.00e-01) #define c4_def (1.00e+00) #define c5_def (1.40e+00) kernels/vector2/rhs/zeta/rhs_zeta_dissipation.cl0000644000175600017620000000466411544123634020611 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector2/rhs/zeta/rhs_zeta4.cl0000644000175600017620000000436611544123634016266 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector2/rhs/zeta/rhs_zeta3.cl0000644000175600017620000000341211544123634016254 0ustar sjpsjp/** * Third part of zeta-direction flux differences. * Update flux based on u. 
*/ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector2/rhs/zeta/rhs_zeta2.cl0000644000175600017620000000257511544123634016264 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector2/rhs/zeta/rhs_zeta1.cl0000644000175600017620000000267711544123634016266 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector2/rhs/xi/rhs_xi_dissipation.cl0000644000175600017620000000471711544123627017744 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector2/rhs/xi/rhs_xi4.cl0000644000175600017620000001212511553015412015402 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd0_v = vload(0, rsd + tiled_index(k, j, i, 0)); vdouble um0_v = vload(0, u + tiled_index(k, j, i-1, 0)); vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble up0_v = vload(0, u + tiled_index(k, j, i+1, 0)); rsd0_v += dx1 * tx1 * ( um0_v - 2.0e+00 * u0_v + up0_v ); vstore(rsd0_v, 0, rsd + tiled_index(k, j, i, 0)); vdouble rsd1_v = vload(0, rsd + tiled_index(k, j, i, 1)); vdouble um1_v = vload(0, u + tiled_index(k, j, i-1, 1)); vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); vdouble up1_v = vload(0, u + tiled_index(k, j, i+1, 1)); vdouble fluxp1_v = vload(0, flux + tiled_index(k, j, i+1, 1)); vdouble flux1_v = vload(0, flux + tiled_index(k, j, i, 1)); rsd1_v += tx3 * c3 * c4 * ( fluxp1_v - flux1_v ) + dx2 * tx1 * ( um1_v - 2.0e+00 * u1_v + up1_v ); vstore(rsd1_v, 0, rsd + tiled_index(k, j, i, 1)); vdouble rsd2_v = vload(0, rsd + tiled_index(k, j, i, 2)); vdouble um2_v = vload(0, u + tiled_index(k, j, i-1, 2)); vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); vdouble up2_v = vload(0, u + tiled_index(k, j, i+1, 2)); vdouble fluxp2_v = vload(0, flux + tiled_index(k, j, i+1, 2)); vdouble flux2_v = vload(0, flux + tiled_index(k, j, i, 2)); rsd2_v += tx3 * c3 * c4 * ( fluxp2_v - flux2_v ) + dx3 * tx1 * ( um2_v - 2.0e+00 * u2_v + up2_v ); vstore(rsd2_v, 0, rsd + tiled_index(k, j, i, 2)); vdouble rsd3_v = vload(0, rsd + tiled_index(k, j, i, 3)); vdouble um3_v = vload(0, u + tiled_index(k, j, i-1, 3)); vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); vdouble up3_v = vload(0, u + tiled_index(k, j, i+1, 3)); vdouble 
fluxp3_v = vload(0, flux + tiled_index(k, j, i+1, 3)); vdouble flux3_v = vload(0, flux + tiled_index(k, j, i, 3)); rsd3_v += tx3 * c3 * c4 * ( fluxp3_v - flux3_v ) + dx4 * tx1 * ( um3_v - 2.0e+00 * u3_v + up3_v ); vstore(rsd3_v, 0, rsd + tiled_index(k, j, i, 3)); vdouble rsd4_v = vload(0, rsd + tiled_index(k, j, i, 4)); vdouble um4_v = vload(0, u + tiled_index(k, j, i-1, 4)); vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); vdouble up4_v = vload(0, u + tiled_index(k, j, i+1, 4)); vdouble fluxp4_v = vload(0, flux + tiled_index(k, j, i+1, 4)); vdouble flux4_v = vload(0, flux + tiled_index(k, j, i, 4)); rsd4_v += tx3 * c3 * c4 * ( fluxp4_v - flux4_v ) + dx5 * tx1 * ( um4_v - 2.0e+00 * u4_v + up4_v ); vstore(rsd4_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector2/rhs/xi/rhs_xi3.cl0000644000175600017620000000754611553015327015421 0ustar sjpsjp/** * The third 
part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21i = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31i = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41i = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51i = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j, i-1, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j, i-1, 1)); const vdouble u21im1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j, i-1, 2)); const vdouble u31im1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j, i-1, 3)); const vdouble u41im1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j, i-1, 4)); const vdouble u51im1 = tmp * u4m_v; const vdouble flux1_v = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); const vdouble flux2_v = tx3 * ( u31i - u31im1 ); const vdouble flux3_v = tx3 * ( u41i - u41im1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + 
u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); // Write out the flux vector. vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } for (; i <= L2; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21i = tmp * u[tiled_index(k, j, i, 1)]; const double u31i = tmp * u[tiled_index(k, j, i, 2)]; const double u41i = tmp * u[tiled_index(k, j, i, 3)]; const double u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; const double u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; const double u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; const double u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; const double u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector2/rhs/xi/rhs_xi2.cl0000644000175600017620000000524211553015372015407 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fip_v, fim_v; const vdouble tx2_v = (vdouble) tx2; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 0)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 0)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 1)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 1)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 2)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 2)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 3)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 3)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 4)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 4)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 
2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/vector2/rhs/xi/rhs_xi1.cl0000644000175600017620000000564611553015400015406 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = L1 + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u21 = u1_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u1_v; vdouble flux1_v = u1_v * u21 + c2 * ( u4_v - q ); vdouble flux2_v = u2_v * u21; vdouble flux3_v = u3_v * u21; vdouble flux4_v = (c1 * u4_v - c2 * q) * u21; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } for (; i <= L2; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flux[t_index + 0 * t_offset] = u[t_index + 1 * t_offset]; const double u21 = u[t_index + 1 * t_offset] / u[t_index + 0 * t_offset]; const double q = 0.50e+00 * ( u[t_index + 1 * t_offset] * u[t_index + 1 * t_offset] + u[t_index + 2 * t_offset] * u[t_index + 2 * t_offset] + u[t_index + 3 * t_offset] * u[t_index + 3 * t_offset] ) / u[t_index + 0 * t_offset]; flux[t_index + 1 * t_offset] = u[t_index + 1 * t_offset] * u21 + c2 * ( u[t_index + 4 * t_offset] - q ); flux[t_index + 2 * t_offset] = u[t_index + 2 * t_offset] * u21; flux[t_index + 3 * t_offset] = u[t_index + 3 * t_offset] * u21; flux[t_index + 4 * t_offset] = ( c1 * u[t_index + 4 * t_offset] - c2 * q ) * u21; } } } } kernels/vector2/rhs/eta/rhs_eta_dissipation.cl0000644000175600017620000000527411544123630020217 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. 
if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector2/rhs/eta/rhs_eta4.cl0000644000175600017620000000441311544123630015667 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector2/rhs/eta/rhs_eta3.cl0000644000175600017620000000754611553015551015702 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. 
if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21j = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31j = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41j = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51j = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j-1, i, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j-1, i, 1)); const vdouble u21jm1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j-1, i, 2)); const vdouble u31jm1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j-1, i, 3)); const vdouble u41jm1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j-1, i, 4)); const vdouble u51jm1 = tmp * u4m_v; const vdouble flux1_v = ty3 * ( u21j - u21jm1 ); const vdouble flux2_v = (4.0e+00/3.0e+00) * ty3 * ( u31j - u31jm1 ); const vdouble flux3_v = ty3 * ( u41j - u41jm1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); // Write out the flux vector. 
vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21j = tmp * u[tiled_index(k, j, i, 1)]; const double u31j = tmp * u[tiled_index(k, j, i, 2)]; const double u41j = tmp * u[tiled_index(k, j, i, 3)]; const double u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; const double u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; const double u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; const double u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; const double u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/vector2/rhs/eta/rhs_eta2.cl0000644000175600017620000000515311553015562015673 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fjp_v, fjm_v; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 0)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 0)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 1)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 1)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 2)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 2)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 3)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 3)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 4)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 4)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); 
rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/vector2/rhs/eta/rhs_eta1.cl0000644000175600017620000000545011553015574015675 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u31 = u2_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u2_v; vdouble flux1_v = u1_v * u31; vdouble flux2_v = u2_v * u31 + c2 * (u4_v - q); vdouble flux3_v = u3_v * u31; vdouble flux4_v = (c1 * u4_v - c2 * q) * u31; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } for (; i <= iend; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in the u values. double u0 = u[t_index + 0 * t_offset]; double u1 = u[t_index + 1 * t_offset]; double u2 = u[t_index + 2 * t_offset]; double u3 = u[t_index + 3 * t_offset]; double u4 = u[t_index + 4 * t_offset]; // Update flux. flux[t_index + 0 * t_offset] = u2; const double u31 = u2 / u0; const double q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[t_index + 1 * t_offset] = u1 * u31; flux[t_index + 2 * t_offset] = u2 * u31 + c2 * ( u4 - q ); flux[t_index + 3 * t_offset] = u3 * u31; flux[t_index + 4 * t_offset] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/vector2/rhs/.svn/entries0000444000175600017620000000056711551607761015363 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 xi dir zeta dir eta dir rhs_setup.cl file 1179 2011-03-29T14:22:41.000000Z 4a55f4166fdc1e0bfa3cf33fcfabe3e2 2011-03-31T11:08:35.335679Z 1179 sjp 1882 kernels/vector/bak/rhs/rhs_setup.cl0000644000175600017620000000353211553015413016074 0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ibound = ((nx + 1)/vlength)*vlength; const int iinc = (isize * vlength); for (i = 2 + (iid*vlength); i <= ibound; i+= iinc) { vdouble rsd_v, frct_v; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; frct_v = vload(0, frct + t_index + 0 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 0 * t_offset); frct_v = vload(0, frct + t_index + 1 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 1 * t_offset); frct_v = vload(0, frct + t_index + 2 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 2 * t_offset); frct_v = vload(0, frct + t_index + 3 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 3 * t_offset); frct_v = vload(0, frct + t_index + 4 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 4 * t_offset); } for (; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/vector/.svn/text-base/rearrangement.cl.svn-base0000444000175600017620000002046511553617672021677 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. */ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = flat_input[flat_index(k, j, i, 0)]; tiled_output[tiled_index(k, j, i, 1)] = flat_input[flat_index(k, j, i, 1)]; tiled_output[tiled_index(k, j, i, 2)] = flat_input[flat_index(k, j, i, 2)]; tiled_output[tiled_index(k, j, i, 3)] = flat_input[flat_index(k, j, i, 3)]; tiled_output[tiled_index(k, j, i, 4)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from tiled to flat layout. */ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = tiled_input[tiled_index(k, j, i, 0)]; flat_output[flat_index(k, j, i, 1)] = tiled_input[tiled_index(k, j, i, 1)]; flat_output[flat_index(k, j, i, 2)] = tiled_input[tiled_index(k, j, i, 2)]; flat_output[flat_index(k, j, i, 3)] = tiled_input[tiled_index(k, j, i, 3)]; flat_output[flat_index(k, j, i, 4)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from tiled to hyperplane layout. 
*/ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } kernels/vector/.svn/text-base/print.cl.svn-base0000444000175600017620000000061411553617365020172 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/vector/.svn/text-base/pre.cl.svn-base0000444000175600017620000000135211553617672017625 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } kernels/vector/.svn/text-base/post.cl.svn-base0000444000175600017620000000166111553617671020026 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } kernels/vector/.svn/text-base/l2norm.cl.svn-base0000444000175600017620000000072311553617365020250 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. 
int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/vector/.svn/text-base/ex3_unpack.cl.svn-base0000444000175600017620000001044411553617364021077 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/vector/.svn/text-base/ex3_pack.cl.svn-base0000444000175600017620000001031111553617364020525 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/vector/.svn/text-base/ex1_unpack.cl.svn-base0000444000175600017620000001043111553617365021072 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/vector/.svn/text-base/ex1_pack.cl.svn-base0000444000175600017620000001040711553617364020531 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/vector/.svn/text-base/buts.cl.svn-base0000444000175600017620000010135711553617671020021 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // Serial tidy-up function. void buts_serial( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k, __const int cell) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. 
int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). 
int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. 
u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. 
const int gid = get_global_id(0); const int threads = get_global_size(0); int cell; const int cellbound = (((isiz1 + 4) * (isiz2 + 4) - vlength)/vlength)*vlength; for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) { const vint iv = vload(0, columns + cell); const vint jv = vload(0, rows + cell); const vint kv = (starting_k - (kblock - 1)) + (wavefront - (iv + jv)); const vint depthv = (wavefront - (iv + jv)); // Look at which elements of the vector need to be updated. vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1); // If they ALL need to be updated, do a "proper" vector op. if (all(b)) { const int i = columns[cell]; const int j = rows[cell]; const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vdouble vn0 = vload(0, v + h_below + 0 * m_offset); vdouble vn1 = vload(0, v + h_below + 1 * m_offset); vdouble vn2 = vload(0, v + h_below + 2 * m_offset); vdouble vn3 = vload(0, v + h_below + 3 * m_offset); vdouble vn4 = vload(0, v + h_below + 4 * m_offset); // Read in u neighbour, for calculation of c. vdouble u0 = vload(0, u + h_below + 0 * m_offset); vdouble u1 = vload(0, u + h_below + 1 * m_offset); vdouble u2 = vload(0, u + h_below + 2 * m_offset); vdouble u3 = vload(0, u + h_below + 3 * m_offset); vdouble u4 = vload(0, u + h_below + 4 * m_offset); // Compute some values based on u0. 
vdouble tmp1 = 1.0e+00 / u0; vdouble tmp2 = tmp1 * tmp1; vdouble tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_below + 0 * m_offset); vdouble v0 = omega * ( jacu_c_value_00 * vn0 ); vdouble v1 = omega * ( jacu_c_value_01 * vn0 ); vdouble v2 = omega * ( jacu_c_value_02 * vn0 ); vdouble v3 = omega * ( jacu_c_value_03 * vn0 ); vdouble v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = vload(0, v + h_below + 1 * m_offset); v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = vload(0, v + h_below + 2 * m_offset); v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = vload(0, v + h_below + 3 * m_offset); v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = vload(0, v + h_below + 4 * m_offset); v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = vload(0, v + h_south + 0 * m_offset); vn1 = vload(0, v + h_south + 1 * m_offset); vn2 = vload(0, v + h_south + 2 * m_offset); vn3 = vload(0, v + h_south + 3 * m_offset); vn4 = vload(0, v + h_south + 4 * m_offset); // Read in u neighbour, for calculation of b. 
u0 = vload(0, u + h_south + 0 * m_offset); u1 = vload(0, u + h_south + 1 * m_offset); u2 = vload(0, u + h_south + 2 * m_offset); u3 = vload(0, u + h_south + 3 * m_offset); u4 = vload(0, u + h_south + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_south + 0 * m_offset); v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = vload(0, v + h_south + 1 * m_offset); v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = vload(0, v + h_south + 2 * m_offset); v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = vload(0, v + h_south + 3 * m_offset); v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = vload(0, v + h_south + 4 * m_offset); v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. 
vn0 = vload(0, v + h_east + 0 * m_offset); vn1 = vload(0, v + h_east + 1 * m_offset); vn2 = vload(0, v + h_east + 2 * m_offset); vn3 = vload(0, v + h_east + 3 * m_offset); vn4 = vload(0, v + h_east + 4 * m_offset); // Read in u neighbour, for calculation of a. u0 = vload(0, u + h_east + 0 * m_offset); u1 = vload(0, u + h_east + 1 * m_offset); u2 = vload(0, u + h_east + 2 * m_offset); u3 = vload(0, u + h_east + 3 * m_offset); u4 = vload(0, u + h_east + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = vload(0, v + h_east + 0 * m_offset); v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = vload(0, v + h_east + 1 * m_offset); v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = vload(0, v + h_east + 2 * m_offset); v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = vload(0, v + h_east + 3 * m_offset); v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = vload(0, v + h_east + 4 * m_offset); v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. 
u0 = vload(0, u + h_index + 0 * m_offset); u1 = vload(0, u + h_index + 1 * m_offset); u2 = vload(0, u + h_index + 2 * m_offset); u3 = vload(0, u + h_index + 3 * m_offset); u4 = vload(0, u + h_index + 4 * m_offset); // Compute some values based on u0. tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vdouble tmat00 = jacu_d_value_00; vdouble tmat10 = jacu_d_value_10; vdouble tmat20 = jacu_d_value_20; vdouble tmat30 = jacu_d_value_30; vdouble tmat40 = jacu_d_value_40; vdouble tmat01 = jacu_d_value_01; vdouble tmat11 = jacu_d_value_11; vdouble tmat21 = jacu_d_value_21; vdouble tmat31 = jacu_d_value_31; vdouble tmat41 = jacu_d_value_41; vdouble tmat02 = jacu_d_value_02; vdouble tmat12 = jacu_d_value_12; vdouble tmat22 = jacu_d_value_22; vdouble tmat32 = jacu_d_value_32; vdouble tmat42 = jacu_d_value_42; vdouble tmat03 = jacu_d_value_03; vdouble tmat13 = jacu_d_value_13; vdouble tmat23 = jacu_d_value_23; vdouble tmat33 = jacu_d_value_33; vdouble tmat43 = jacu_d_value_43; vdouble tmat04 = jacu_d_value_04; vdouble tmat14 = jacu_d_value_14; vdouble tmat24 = jacu_d_value_24; vdouble tmat34 = jacu_d_value_34; vdouble tmat44 = jacu_d_value_44; // ip = 0. vdouble tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update v. vdouble old_v; old_v = vload(0, v + h_index + 0 * m_offset); old_v -= v0; vstore(old_v, 0, v + h_index + 0 * m_offset); old_v = vload(0, v + h_index + 1 * m_offset); old_v -= v1; vstore(old_v, 0, v + h_index + 1 * m_offset); old_v = vload(0, v + h_index + 2 * m_offset); old_v -= v2; vstore(old_v, 0, v + h_index + 2 * m_offset); old_v = vload(0, v + h_index + 3 * m_offset); old_v -= v3; vstore(old_v, 0, v + h_index + 3 * m_offset); old_v = vload(0, v + h_index + 4 * m_offset); old_v -= v4; vstore(old_v, 0, v + h_index + 4 * m_offset); } else if (any(b)) { int vcell; for (vcell = 0; vcell < vlength; vcell++) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell + vcell); } } } // Serial tidy-up. 
for (; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { buts_serial(v, u, wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping, wavefront, starting_k, cell); } } kernels/vector/.svn/text-base/blts.cl.svn-base0000444000175600017620000005225711553617672020015 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. #define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * 
u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 
) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 
) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * 
r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 ))
/**
 * OpenCL kernel for the blts (lower-triangular solve) step of the SSOR
 * iteration.  Each vector lane handles one (i, j) cell of the current
 * wavefront; the jacld_a/b/c macros evaluate the sub-diagonal Jacobian
 * blocks for the (k-1), (j-1) and (i-1) neighbours, and jacld_d the
 * diagonal block, which is then inverted by Gaussian elimination.
 *
 * NOTE(review): back substitution previously used the chained form
 * `v2 -= tmat32 * v3 - tmat42 * v4;`, which evaluates to
 * v2 - tmat32*v3 + tmat42*v4 — the sign of every term after the first
 * was flipped (likewise for the v1 and v0 rows).  The reference LU
 * back substitution subtracts every off-diagonal term; fixed below.
 */
__kernel void blts_kernel(
    __global double* v,
    __global const double* u,
    __global const int* wavefront_offsets_2d,
    __global const int* wavefront_offsets_3d,
    __global const int* columns,
    __global const int* rows,
    __global const int* thread_mapping,
    __const int wavefront,
    __const int starting_k)
{
    // Get thread id.
    const int gid = get_global_id(0);
    const int threads = get_global_size(0);
    int cell;
    const int cellbound = (isiz1 + 4) * (isiz2 + 4);
    for (cell = (gid*vlength); cell < cellbound; cell += (threads*vlength)) {
        const vint iv = vload(0, columns + cell);
        const vint jv = vload(0, rows + cell);
        const vint kv = starting_k + (wavefront - (iv + jv));
        const vint depthv = (wavefront - (iv + jv));
        // Per-lane mask: which elements of the vector need to be updated.
        vint b = (iv >= ist && iv <= iend && jv >= jst && jv <= jend && kv >= 1 && kv <= nz - 2 && depthv >= 0 && depthv <= kblock - 1);
        // If ANY lane needs updating, compute the whole vector and mask the store.
        if (any(b)) {
            const int i = columns[cell];
            const int j = rows[cell];
            const int k = starting_k + (wavefront - (i + j));
            const double r43 = ( 4.0e+00 / 3.0e+00 );
            const double c1345 = c1_def * c3_def * c4_def * c5_def;
            const double c34 = c3_def * c4_def;
            const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Initialise values of v.
            vdouble v0 = vload(0, v + h_index + 0 * m_offset);
            vdouble v1 = vload(0, v + h_index + 1 * m_offset);
            vdouble v2 = vload(0, v + h_index + 2 * m_offset);
            vdouble v3 = vload(0, v + h_index + 3 * m_offset);
            vdouble v4 = vload(0, v + h_index + 4 * m_offset);
            // Update v from the cell's neighbour in the k direction: (k-1, j, i).
            const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Read in u neighbours, for calculation of a.
            vdouble u0 = vload(0, u + h_above + 0 * m_offset);
            vdouble u1 = vload(0, u + h_above + 1 * m_offset);
            vdouble u2 = vload(0, u + h_above + 2 * m_offset);
            vdouble u3 = vload(0, u + h_above + 3 * m_offset);
            vdouble u4 = vload(0, u + h_above + 4 * m_offset);
            // tmp1..tmp3 feed the jacld_* macros (1/rho, 1/rho^2, 1/rho^3).
            vdouble tmp1 = 1.0e+00 / u0;
            vdouble tmp2 = tmp1 * tmp1;
            vdouble tmp3 = tmp1 * tmp2;
            vdouble vn0 = vload(0, v + h_above + 0 * m_offset);
            v0 -= omega * ( jacld_a_value_00 * vn0 );
            v1 -= omega * ( jacld_a_value_01 * vn0 );
            v2 -= omega * ( jacld_a_value_02 * vn0 );
            v3 -= omega * ( jacld_a_value_03 * vn0 );
            v4 -= omega * ( jacld_a_value_04 * vn0 );
            vdouble vn1 = vload(0, v + h_above + 1 * m_offset);
            v0 -= omega * ( jacld_a_value_10 * vn1 );
            v1 -= omega * ( jacld_a_value_11 * vn1 );
            v2 -= omega * ( jacld_a_value_12 * vn1 );
            v3 -= omega * ( jacld_a_value_13 * vn1 );
            v4 -= omega * ( jacld_a_value_14 * vn1 );
            vdouble vn2 = vload(0, v + h_above + 2 * m_offset);
            v0 -= omega * ( jacld_a_value_20 * vn2 );
            v1 -= omega * ( jacld_a_value_21 * vn2 );
            v2 -= omega * ( jacld_a_value_22 * vn2 );
            v3 -= omega * ( jacld_a_value_23 * vn2 );
            v4 -= omega * ( jacld_a_value_24 * vn2 );
            vdouble vn3 = vload(0, v + h_above + 3 * m_offset);
            v0 -= omega * ( jacld_a_value_30 * vn3 );
            v1 -= omega * ( jacld_a_value_31 * vn3 );
            v2 -= omega * ( jacld_a_value_32 * vn3 );
            v3 -= omega * ( jacld_a_value_33 * vn3 );
            v4 -= omega * ( jacld_a_value_34 * vn3 );
            vdouble vn4 = vload(0, v + h_above + 4 * m_offset);
            v0 -= omega * ( jacld_a_value_40 * vn4 );
            v1 -= omega * ( jacld_a_value_41 * vn4 );
            v2 -= omega * ( jacld_a_value_42 * vn4 );
            v3 -= omega * ( jacld_a_value_43 * vn4 );
            v4 -= omega * ( jacld_a_value_44 * vn4 );
            // Update v from the cell's neighbour in the j dimension: (k, j-1, i).
            const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Read in u neighbours, for calculation of b.
            u0 = vload(0, u + h_north + 0 * m_offset);
            u1 = vload(0, u + h_north + 1 * m_offset);
            u2 = vload(0, u + h_north + 2 * m_offset);
            u3 = vload(0, u + h_north + 3 * m_offset);
            u4 = vload(0, u + h_north + 4 * m_offset);
            tmp1 = 1.0e+00 / u0;
            tmp2 = tmp1 * tmp1;
            tmp3 = tmp1 * tmp2;
            vn0 = vload(0, v + h_north + 0 * m_offset);
            v0 -= omega * ( jacld_b_value_00 * vn0 );
            v1 -= omega * ( jacld_b_value_01 * vn0 );
            v2 -= omega * ( jacld_b_value_02 * vn0 );
            v3 -= omega * ( jacld_b_value_03 * vn0 );
            v4 -= omega * ( jacld_b_value_04 * vn0 );
            vn1 = vload(0, v + h_north + 1 * m_offset);
            v0 -= omega * ( jacld_b_value_10 * vn1 );
            v1 -= omega * ( jacld_b_value_11 * vn1 );
            v2 -= omega * ( jacld_b_value_12 * vn1 );
            v3 -= omega * ( jacld_b_value_13 * vn1 );
            v4 -= omega * ( jacld_b_value_14 * vn1 );
            vn2 = vload(0, v + h_north + 2 * m_offset);
            v0 -= omega * ( jacld_b_value_20 * vn2 );
            v1 -= omega * ( jacld_b_value_21 * vn2 );
            v2 -= omega * ( jacld_b_value_22 * vn2 );
            v3 -= omega * ( jacld_b_value_23 * vn2 );
            v4 -= omega * ( jacld_b_value_24 * vn2 );
            vn3 = vload(0, v + h_north + 3 * m_offset);
            v0 -= omega * ( jacld_b_value_30 * vn3 );
            v1 -= omega * ( jacld_b_value_31 * vn3 );
            v2 -= omega * ( jacld_b_value_32 * vn3 );
            v3 -= omega * ( jacld_b_value_33 * vn3 );
            v4 -= omega * ( jacld_b_value_34 * vn3 );
            vn4 = vload(0, v + h_north + 4 * m_offset);
            v0 -= omega * ( jacld_b_value_40 * vn4 );
            v1 -= omega * ( jacld_b_value_41 * vn4 );
            v2 -= omega * ( jacld_b_value_42 * vn4 );
            v3 -= omega * ( jacld_b_value_43 * vn4 );
            v4 -= omega * ( jacld_b_value_44 * vn4 );
            // Update v from the cell's neighbour in the i dimension: (k, j, i-1).
            const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping);
            // Read in u neighbours, for calculation of c.
            u0 = vload(0, u + h_west + 0 * m_offset);
            u1 = vload(0, u + h_west + 1 * m_offset);
            u2 = vload(0, u + h_west + 2 * m_offset);
            u3 = vload(0, u + h_west + 3 * m_offset);
            u4 = vload(0, u + h_west + 4 * m_offset);
            tmp1 = 1.0e+00 / u0;
            tmp2 = tmp1 * tmp1;
            tmp3 = tmp1 * tmp2;
            vn0 = vload(0, v + h_west + 0 * m_offset);
            v0 -= omega * ( jacld_c_value_00 * vn0 );
            v1 -= omega * ( jacld_c_value_01 * vn0 );
            v2 -= omega * ( jacld_c_value_02 * vn0 );
            v3 -= omega * ( jacld_c_value_03 * vn0 );
            v4 -= omega * ( jacld_c_value_04 * vn0 );
            vn1 = vload(0, v + h_west + 1 * m_offset);
            v0 -= omega * ( jacld_c_value_10 * vn1 );
            v1 -= omega * ( jacld_c_value_11 * vn1 );
            v2 -= omega * ( jacld_c_value_12 * vn1 );
            v3 -= omega * ( jacld_c_value_13 * vn1 );
            v4 -= omega * ( jacld_c_value_14 * vn1 );
            vn2 = vload(0, v + h_west + 2 * m_offset);
            v0 -= omega * ( jacld_c_value_20 * vn2 );
            v1 -= omega * ( jacld_c_value_21 * vn2 );
            v2 -= omega * ( jacld_c_value_22 * vn2 );
            v3 -= omega * ( jacld_c_value_23 * vn2 );
            v4 -= omega * ( jacld_c_value_24 * vn2 );
            vn3 = vload(0, v + h_west + 3 * m_offset);
            v0 -= omega * ( jacld_c_value_30 * vn3 );
            v1 -= omega * ( jacld_c_value_31 * vn3 );
            v2 -= omega * ( jacld_c_value_32 * vn3 );
            v3 -= omega * ( jacld_c_value_33 * vn3 );
            v4 -= omega * ( jacld_c_value_34 * vn3 );
            vn4 = vload(0, v + h_west + 4 * m_offset);
            v0 -= omega * ( jacld_c_value_40 * vn4 );
            v1 -= omega * ( jacld_c_value_41 * vn4 );
            v2 -= omega * ( jacld_c_value_42 * vn4 );
            v3 -= omega * ( jacld_c_value_43 * vn4 );
            v4 -= omega * ( jacld_c_value_44 * vn4 );
            /**
             * Diagonal block inversion: forward elimination.
             */
            // Read in u values at the cell itself for the diagonal block.
            u0 = vload(0, u + h_index + 0 * m_offset);
            u1 = vload(0, u + h_index + 1 * m_offset);
            u2 = vload(0, u + h_index + 2 * m_offset);
            u3 = vload(0, u + h_index + 3 * m_offset);
            u4 = vload(0, u + h_index + 4 * m_offset);
            tmp1 = 1.0e+00 / u0;
            tmp2 = tmp1 * tmp1;
            tmp3 = tmp1 * tmp2;
            // tmatXY is the diagonal 5x5 block, column X / row Y.
            vdouble tmat00 = jacld_d_value_00;
            vdouble tmat10 = jacld_d_value_10;
            vdouble tmat20 = jacld_d_value_20;
            vdouble tmat30 = jacld_d_value_30;
            vdouble tmat40 = jacld_d_value_40;
            vdouble tmat01 = jacld_d_value_01;
            vdouble tmat11 = jacld_d_value_11;
            vdouble tmat21 = jacld_d_value_21;
            vdouble tmat31 = jacld_d_value_31;
            vdouble tmat41 = jacld_d_value_41;
            vdouble tmat02 = jacld_d_value_02;
            vdouble tmat12 = jacld_d_value_12;
            vdouble tmat22 = jacld_d_value_22;
            vdouble tmat32 = jacld_d_value_32;
            vdouble tmat42 = jacld_d_value_42;
            vdouble tmat03 = jacld_d_value_03;
            vdouble tmat13 = jacld_d_value_13;
            vdouble tmat23 = jacld_d_value_23;
            vdouble tmat33 = jacld_d_value_33;
            vdouble tmat43 = jacld_d_value_43;
            vdouble tmat04 = jacld_d_value_04;
            vdouble tmat14 = jacld_d_value_14;
            vdouble tmat24 = jacld_d_value_24;
            vdouble tmat34 = jacld_d_value_34;
            vdouble tmat44 = jacld_d_value_44;
            // ip = 0.
            vdouble tmp;
            tmp1 = 1.0e+00 / tmat00;
            tmp = tmp1 * tmat01;
            tmat11 -= tmp * tmat10;
            tmat21 -= tmp * tmat20;
            tmat31 -= tmp * tmat30;
            tmat41 -= tmp * tmat40;
            v1 -= v0 * tmp;
            tmp = tmp1 * tmat02;
            tmat12 -= tmp * tmat10;
            tmat22 -= tmp * tmat20;
            tmat32 -= tmp * tmat30;
            tmat42 -= tmp * tmat40;
            v2 -= v0 * tmp;
            tmp = tmp1 * tmat03;
            tmat13 -= tmp * tmat10;
            tmat23 -= tmp * tmat20;
            tmat33 -= tmp * tmat30;
            tmat43 -= tmp * tmat40;
            v3 -= v0 * tmp;
            tmp = tmp1 * tmat04;
            tmat14 -= tmp * tmat10;
            tmat24 -= tmp * tmat20;
            tmat34 -= tmp * tmat30;
            tmat44 -= tmp * tmat40;
            v4 -= v0 * tmp;
            // ip = 1.
            tmp1 = 1.0e+00 / tmat11;
            tmp = tmp1 * tmat12;
            tmat22 -= tmp * tmat21;
            tmat32 -= tmp * tmat31;
            tmat42 -= tmp * tmat41;
            v2 -= v1 * tmp;
            tmp = tmp1 * tmat13;
            tmat23 -= tmp * tmat21;
            tmat33 -= tmp * tmat31;
            tmat43 -= tmp * tmat41;
            v3 -= v1 * tmp;
            tmp = tmp1 * tmat14;
            tmat24 -= tmp * tmat21;
            tmat34 -= tmp * tmat31;
            tmat44 -= tmp * tmat41;
            v4 -= v1 * tmp;
            // ip = 2.
            tmp1 = 1.0e+00 / tmat22;
            tmp = tmp1 * tmat23;
            tmat33 -= tmp * tmat32;
            tmat43 -= tmp * tmat42;
            v3 -= v2 * tmp;
            tmp = tmp1 * tmat24;
            tmat34 -= tmp * tmat32;
            tmat44 -= tmp * tmat42;
            v4 -= v2 * tmp;
            // ip = 3.
            tmp1 = 1.0e+00 / tmat33;
            tmp = tmp1 * tmat34;
            tmat44 -= tmp * tmat43;
            v4 -= v3 * tmp;
            /**
             * Back substitution.  Every off-diagonal term is subtracted
             * separately (fixes the flipped signs introduced by the old
             * chained `a -= b - c - d` form, which added c and d).
             */
            v4 /= tmat44;
            v3 -= tmat43 * v4;
            v3 /= tmat33;
            v2 -= tmat32 * v3;
            v2 -= tmat42 * v4;
            v2 /= tmat22;
            v1 -= tmat21 * v2;
            v1 -= tmat31 * v3;
            v1 -= tmat41 * v4;
            v1 /= tmat11;
            v0 -= tmat10 * v1;
            v0 -= tmat20 * v2;
            v0 -= tmat30 * v3;
            v0 -= tmat40 * v4;
            v0 /= tmat00;
            // Masked store: reuse the mask computed above so inactive lanes
            // keep their old value of v.
            vlong b2 = convert_long2(b);
            vdouble old_v = vload(0, v + h_index + 0 * m_offset);
            v0 = select(old_v, v0, b2);
            vstore(v0, 0, v + h_index + 0 * m_offset);
            old_v = vload(0, v + h_index + 1 * m_offset);
            v1 = select(old_v, v1, b2);
            vstore(v1, 0, v + h_index + 1 * m_offset);
            old_v = vload(0, v + h_index + 2 * m_offset);
            v2 = select(old_v, v2, b2);
            vstore(v2, 0, v + h_index + 2 * m_offset);
            old_v = vload(0, v + h_index + 3 * m_offset);
            v3 = select(old_v, v3, b2);
            vstore(v3, 0, v + h_index + 3 * m_offset);
            old_v = vload(0, v + h_index + 4 * m_offset);
            v4 = select(old_v, v4, b2);
            vstore(v4, 0, v + h_index + 4 * m_offset);
        }
    }
}
kernels/vector/rhs/xi2/rhs_xi_dissipation.cl0000644000175600017620000000471711541636230017740
0ustar sjpsjp/**
 * Fourth-order dissipation step in xi-direction.
 *
 * Adds the artificial-dissipation stencil to rsd; the boundary rows near
 * the north (i = 3, 4) and south (i = nx-1, nx) faces use one-sided
 * stencils when this rank owns the face (north/south == -1).
 */
__kernel void rhs_xi_dissipation_kernel(
    __global const double* u,
    __global double* rsd)
{
    int m;
    // Interior-sweep bounds.  These are array-index bounds compared
    // against the int loop counter i, so they are int (previously they
    // were mistakenly declared double).  Both are always assigned before
    // use: the north/south checks below cover both == -1 and != -1.
    int ist1, iend1;
    // Calculate i, j and k values for loops.
    const int iid = get_global_id(0);
    const int jid = get_global_id(1);
    const int kid = get_global_id(2);
    const int isize = get_global_size(0);
    const int jsize = get_global_size(1);
    const int ksize = get_global_size(2);
    int i, j, k;
    for (k = 1 + kid; k <= nz - 2; k += ksize) {
        for (j = jst + jid; j <= jend; j += jsize) {
            for (i = 0 + iid; i <= isiz1 + 4; i += isize) {
                if (north == -1) {
                    // One-sided stencils at the north face.
                    for (m = 0; m < 5; m++) {
                        if (i == 3) {
                            rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] );
                        }
                        if (i == 4) {
                            rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] );
                        }
                    }
                }
                // Update ist1 and iend1 based on north and south.
                if (north != -1) { ist1 = 2; }
                if (south != -1) { iend1 = nx + 1; }
                if (north == -1) { ist1 = 5; }
                if (south == -1) { iend1 = nx - 2; }
                // If i is in range, apply the centred five-point stencil.
                if (i >= ist1 && i <= iend1) {
                    for (m = 0; m < 5; m++) {
                        rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] );
                    }
                }
                if (south == -1) {
                    // One-sided stencils at the south face.
                    for (m = 0; m < 5; m++) {
                        if (i == nx - 1) {
                            rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] );
                        }
                        if (i == nx) {
                            rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] );
                        }
                    }
                }
            }
        }
    }
}
kernels/vector/rhs/xi2/rhs_xi4.cl0000644000175600017620000000420511541636015015407 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops.
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector/rhs/xi2/rhs_xi3.cl0000644000175600017620000000362211541635736015421 0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. 
if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector/rhs/xi2/rhs_xi2.cl0000644000175600017620000000223511541635711015410 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/vector/rhs/xi2/rhs_xi1.cl0000644000175600017620000000317211553044741015410 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u21; int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; } } } } kernels/vector/rhs/xi/rhs_xi_dissipation.cl0000644000175600017620000000471711553620310017651 0ustar sjpsjp/**
 * Fourth-order dissipation step in xi-direction.
 *
 * Adds the artificial-dissipation stencil to rsd; the boundary rows near
 * the north (i = 3, 4) and south (i = nx-1, nx) faces use one-sided
 * stencils when this rank owns the face (north/south == -1).
 */
__kernel void rhs_xi_dissipation_kernel(
    __global const double* u,
    __global double* rsd)
{
    int m;
    // Interior-sweep bounds.  These are array-index bounds compared
    // against the int loop counter i, so they are int (previously they
    // were mistakenly declared double).  Both are always assigned before
    // use: the north/south checks below cover both == -1 and != -1.
    int ist1, iend1;
    // Calculate i, j and k values for loops.
    const int iid = get_global_id(0);
    const int jid = get_global_id(1);
    const int kid = get_global_id(2);
    const int isize = get_global_size(0);
    const int jsize = get_global_size(1);
    const int ksize = get_global_size(2);
    int i, j, k;
    for (k = 1 + kid; k <= nz - 2; k += ksize) {
        for (j = jst + jid; j <= jend; j += jsize) {
            for (i = 0 + iid; i <= isiz1 + 4; i += isize) {
                if (north == -1) {
                    // One-sided stencils at the north face.
                    for (m = 0; m < 5; m++) {
                        if (i == 3) {
                            rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] );
                        }
                        if (i == 4) {
                            rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] );
                        }
                    }
                }
                // Update ist1 and iend1 based on north and south.
                if (north != -1) { ist1 = 2; }
                if (south != -1) { iend1 = nx + 1; }
                if (north == -1) { ist1 = 5; }
                if (south == -1) { iend1 = nx - 2; }
                // If i is in range, apply the centred five-point stencil.
                if (i >= ist1 && i <= iend1) {
                    for (m = 0; m < 5; m++) {
                        rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] );
                    }
                }
                if (south == -1) {
                    // One-sided stencils at the south face.
                    for (m = 0; m < 5; m++) {
                        if (i == nx - 1) {
                            rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] );
                        }
                        if (i == nx) {
                            rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] );
                        }
                    }
                }
            }
        }
    }
}
kernels/vector/rhs/xi/rhs_xi4.cl0000644000175600017620000000420511553620310015327 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops.
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector/rhs/xi/rhs_xi3.cl0000644000175600017620000000362211553620310015320 0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. 
if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector/rhs/xi/rhs_xi2.cl0000644000175600017620000000223511553620310015316 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/vector/rhs/xi/rhs_xi1.cl0000644000175600017620000000317211553620310015316 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u21; int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; } } } } kernels/vector/rhs/zeta2/rhs_zeta_dissipation.cl0000644000175600017620000000466411541644534020615 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/rhs/zeta2/rhs_zeta4.cl0000644000175600017620000000436611541644475016276 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector/rhs/zeta2/rhs_zeta3.cl0000644000175600017620000000341211541644436016261 0ustar sjpsjp/** * Third part of zeta-direction flux differences. * Update flux based on u. 
*/ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector/rhs/zeta2/rhs_zeta2.cl0000644000175600017620000000257511541644407016267 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector/rhs/zeta2/rhs_zeta1.cl0000644000175600017620000000267711541644363016272 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector/rhs/eta2/rhs_eta_dissipation.cl0000644000175600017620000000527411541636665020235 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. 
if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector/rhs/eta2/rhs_eta4.cl0000644000175600017620000000441311541636622015676 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector/rhs/eta2/rhs_eta3.cl0000644000175600017620000000356611541636574015713 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/vector/rhs/eta2/rhs_eta2.cl0000644000175600017620000000224611541636540015675 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/vector/rhs/eta2/rhs_eta1.cl0000644000175600017620000000277611541636504015704 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u31; int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Read in the u values. 
double u0 = u[tiled_index(k, j, i, 0)]; double u1 = u[tiled_index(k, j, i, 1)]; double u2 = u[tiled_index(k, j, i, 2)]; double u3 = u[tiled_index(k, j, i, 3)]; double u4 = u[tiled_index(k, j, i, 4)]; // Update flux. flux[tiled_index(k, j, i, 0)] = u2; u31 = u2 / u0; q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[tiled_index(k, j, i, 1)] = u1 * u31; flux[tiled_index(k, j, i, 2)] = u2 * u31 + c2 * ( u4 - q ); flux[tiled_index(k, j, i, 3)] = u3 * u31; flux[tiled_index(k, j, i, 4)] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/vector/rhs/zeta/rhs_zeta_dissipation.cl0000644000175600017620000000466411553620320020521 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, 
i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/rhs/zeta/rhs_zeta4.cl0000644000175600017620000000436611553620320016176 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * 
u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector/rhs/zeta/rhs_zeta3.cl0000644000175600017620000000341211553620320016164 0ustar sjpsjp/** * Third part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector/rhs/zeta/rhs_zeta2.cl0000644000175600017620000000257511553620320016174 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector/rhs/zeta/rhs_zeta1.cl0000644000175600017620000000267711553620320016176 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector/rhs/eta/rhs_eta_dissipation.cl0000644000175600017620000000527411553620314020136 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. 
if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector/rhs/eta/rhs_eta4.cl0000644000175600017620000000441311553620314015606 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector/rhs/eta/rhs_eta3.cl0000644000175600017620000000356611553620314015615 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/vector/rhs/eta/rhs_eta2.cl0000644000175600017620000000224611553620314015606 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/vector/rhs/eta/rhs_eta1.cl0000644000175600017620000000277611553620314015615 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u31; int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Read in the u values. 
double u0 = u[tiled_index(k, j, i, 0)]; double u1 = u[tiled_index(k, j, i, 1)]; double u2 = u[tiled_index(k, j, i, 2)]; double u3 = u[tiled_index(k, j, i, 3)]; double u4 = u[tiled_index(k, j, i, 4)]; // Update flux. flux[tiled_index(k, j, i, 0)] = u2; u31 = u2 / u0; q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[tiled_index(k, j, i, 1)] = u1 * u31; flux[tiled_index(k, j, i, 2)] = u2 * u31 + c2 * ( u4 - q ); flux[tiled_index(k, j, i, 3)] = u3 * u31; flux[tiled_index(k, j, i, 4)] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/vector/rhs/.svn/entries0000444000175600017620000000145011753220711015257 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs svn://svn/perfmodelling 2011-04-20T18:08:11.549895Z 1212 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 xi dir rhs_eta.cl file 2011-04-20T18:07:30.000000Z a0204f729c54a6f786c1ae5c122259a4 2011-04-20T18:08:11.549895Z 1212 sjp 6491 zeta dir eta dir rhs_setup.cl file 2011-03-21T13:34:52.000000Z 15242aa7cbd07244691e0ca2c3126805 2011-04-20T18:02:45.237789Z 1211 sjp 913 rhs_xi.cl file 2011-04-18T14:45:57.000000Z 0834557a220d269fc67481837939e968 2011-04-20T18:08:11.549895Z 1212 sjp 7099 rhs_zeta.cl file 2011-04-20T18:07:31.000000Z 87b7096d0b520c15c94004f97ed70ee0 2011-04-20T18:08:11.549895Z 1212 sjp 7185 kernels/scalar/rhs/zeta/rhs_zeta_dissipation.cl0000644000175600017620000000466411553326133020471 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/scalar/rhs/zeta/rhs_zeta4.cl0000644000175600017620000000436611553326135016150 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/scalar/rhs/zeta/rhs_zeta3.cl0000644000175600017620000000341211553326140016132 0ustar sjpsjp/** * Third part of zeta-direction flux differences. * Update flux based on u. 
*/ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/scalar/rhs/zeta/rhs_zeta2.cl0000644000175600017620000000257511553326142016144 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/scalar/rhs/zeta/rhs_zeta1.cl0000644000175600017620000000267711553326153016150 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/scalar/rhs/xi/rhs_xi_dissipation.cl0000644000175600017620000000471711541636230017621 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/scalar/rhs/xi/rhs_xi4.cl0000644000175600017620000000420511541636015015270 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/scalar/rhs/xi/rhs_xi3.cl0000644000175600017620000000362211541635736015302 0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. 
if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/scalar/rhs/xi/rhs_xi2.cl0000644000175600017620000000223511541635711015271 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/scalar/rhs/xi/rhs_xi1.cl0000644000175600017620000000317211570737742015302 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u21; int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; } } } } kernels/scalar/rhs/eta/rhs_eta_dissipation.cl0000644000175600017620000000527411553326301020100 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. 
if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/scalar/rhs/eta/rhs_eta4.cl0000644000175600017620000000441311553326275015562 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/scalar/rhs/eta/rhs_eta3.cl0000644000175600017620000000356611553326273015567 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/scalar/rhs/eta/rhs_eta2.cl0000644000175600017620000000224611553326271015556 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/scalar/rhs/eta/rhs_eta1.cl0000644000175600017620000000277611553326267015572 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u31; int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Read in the u values. 
double u0 = u[tiled_index(k, j, i, 0)]; double u1 = u[tiled_index(k, j, i, 1)]; double u2 = u[tiled_index(k, j, i, 2)]; double u3 = u[tiled_index(k, j, i, 3)]; double u4 = u[tiled_index(k, j, i, 4)]; // Update flux. flux[tiled_index(k, j, i, 0)] = u2; u31 = u2 / u0; q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[tiled_index(k, j, i, 1)] = u1 * u31; flux[tiled_index(k, j, i, 2)] = u2 * u31 + c2 * ( u4 - q ); flux[tiled_index(k, j, i, 3)] = u3 * u31; flux[tiled_index(k, j, i, 4)] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/scalar/rhs/.svn/entries0000444000175600017620000000145011753220711015222 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/scalar/rhs svn://svn/perfmodelling 2011-04-20T18:02:45.237789Z 1211 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 xi dir rhs_eta.cl file 2011-04-19T15:48:43.000000Z a0204f729c54a6f786c1ae5c122259a4 2011-04-20T18:02:45.237789Z 1211 sjp 6491 zeta dir rhs_setup.cl file 2011-03-21T13:34:52.000000Z 15242aa7cbd07244691e0ca2c3126805 2011-03-23T14:53:37.138628Z 1172 sjp 913 eta dir rhs_xi.cl file 2011-04-19T15:22:19.000000Z c56c774edd4c1a76e854ccf8d6a63dac 2011-04-20T18:02:45.237789Z 1211 sjp 6113 rhs_zeta.cl file 2011-04-19T16:14:06.000000Z 87b7096d0b520c15c94004f97ed70ee0 2011-04-20T18:02:45.237789Z 1211 sjp 7185 kernels/scalar/.svn/text-base/rearrangement.cl.svn-base0000444000175600017620000002046511542404555021632 0ustar sjpsjp/** * Kernel to replace the "memset" functionality of CUDA. */ __kernel void memset_double_kernel( __global double* buffer, __const double value, __const int number) { // Determine thread indices. const int tid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. int cell; for (cell = tid; cell <= number; cell += threads) { buffer[cell] = value; } } /** * Shift from flat to hyperplane layout. 
*/ __kernel void flat_to_hyperplane_kernel( __global const double* flat_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to flat layout. */ __kernel void hyperplane_to_flat_kernel( __global const double* hyperplane_input, __global double* flat_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; flat_output[flat_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } /** * Shift from flat to tiled layout. */ __kernel void flat_to_tiled_kernel( __global const double* flat_input, __global double* tiled_output) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = flat_input[flat_index(k, j, i, 0)]; tiled_output[tiled_index(k, j, i, 1)] = flat_input[flat_index(k, j, i, 1)]; tiled_output[tiled_index(k, j, i, 2)] = flat_input[flat_index(k, j, i, 2)]; tiled_output[tiled_index(k, j, i, 3)] = flat_input[flat_index(k, j, i, 3)]; tiled_output[tiled_index(k, j, i, 4)] = flat_input[flat_index(k, j, i, 4)]; } } } } /** * Shift from tiled to flat layout. */ __kernel void tiled_to_flat_kernel( __global const double* tiled_input, __global double* flat_output) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { flat_output[flat_index(k, j, i, 0)] = tiled_input[tiled_index(k, j, i, 0)]; flat_output[flat_index(k, j, i, 1)] = tiled_input[tiled_index(k, j, i, 1)]; flat_output[flat_index(k, j, i, 2)] = tiled_input[tiled_index(k, j, i, 2)]; flat_output[flat_index(k, j, i, 3)] = tiled_input[tiled_index(k, j, i, 3)]; flat_output[flat_index(k, j, i, 4)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from tiled to hyperplane layout. 
*/ __kernel void tiled_to_hyperplane_kernel( __global const double* tiled_input, __global double* hyperplane_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { hyperplane_output[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 0)]; hyperplane_output[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 1)]; hyperplane_output[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 2)]; hyperplane_output[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 3)]; hyperplane_output[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)] = tiled_input[tiled_index(k, j, i, 4)]; } } } } /** * Shift from hyperplane to tiled layout. */ __kernel void hyperplane_to_tiled_kernel( __global const double* hyperplane_input, __global double* tiled_output, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k < nz; k += ksize) { for (j = 0 + jid; j < (isiz2 + 4); j += jsize) { for (i = 0 + iid; i < (isiz1 + 4); i += isize) { tiled_output[tiled_index(k, j, i, 0)] = hyperplane_input[hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 1)] = hyperplane_input[hyperplane_index(k, j, i, 1, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 2)] = hyperplane_input[hyperplane_index(k, j, i, 2, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 3)] = hyperplane_input[hyperplane_index(k, j, i, 3, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; tiled_output[tiled_index(k, j, i, 4)] = hyperplane_input[hyperplane_index(k, j, i, 4, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping)]; } } } } kernels/scalar/.svn/text-base/print.cl.svn-base0000444000175600017620000000061411542404557020130 0ustar sjpsjp/** * A bunch of utility kernels for printing the contents of cl_mem objects. */ __kernel void print_mem_kernel(__global double* memory, const int n) { // Force this to be printed serially. int tid = get_global_id(0); if (tid == 0) { int i; printf("{"); for (i = 0; i < n; i++) { printf("%f", memory[i]); if (i != n-1) { printf(", "); } } printf("}\n"); } } kernels/scalar/.svn/text-base/pre.cl.svn-base0000444000175600017620000000135211542404556017561 0ustar sjpsjp// OpenCL kernel for preprocessing step. __kernel void pre_kernel( __global double* rsd) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] *= dt; rsd[tiled_index(k, j, i, 1)] *= dt; rsd[tiled_index(k, j, i, 2)] *= dt; rsd[tiled_index(k, j, i, 3)] *= dt; rsd[tiled_index(k, j, i, 4)] *= dt; } } } } kernels/scalar/.svn/text-base/post.cl.svn-base0000444000175600017620000000166111542404556017763 0ustar sjpsjp// OpenCL kernel for postprocessing step. __kernel void post_kernel( __global double* u, __global const double* rsd, __const double tmp) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { u[tiled_index(k, j, i, 0)] += tmp * rsd[tiled_index(k, j, i, 0)]; u[tiled_index(k, j, i, 1)] += tmp * rsd[tiled_index(k, j, i, 1)]; u[tiled_index(k, j, i, 2)] += tmp * rsd[tiled_index(k, j, i, 2)]; u[tiled_index(k, j, i, 3)] += tmp * rsd[tiled_index(k, j, i, 3)]; u[tiled_index(k, j, i, 4)] += tmp * rsd[tiled_index(k, j, i, 4)]; } } } } kernels/scalar/.svn/text-base/l2norm.cl.svn-base0000444000175600017620000000072311542404557020206 0ustar sjpsjp// OpenCL kernel for l2norm. __kernel void l2norm_kernel( __global const double* rsd, __global double* sum, __const int nz0) { // Compute thread id. int m = get_global_id(0); double lsum = 0.0e+00; // Compute the sum for this m. 
int k, j, i; for (k = 1; k <= nz0 - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { double v = rsd[tiled_index(k, j, i, m)]; lsum += v * v; } } } sum[m] = lsum; } kernels/scalar/.svn/text-base/ex3_unpack.cl.svn-base0000444000175600017620000001044411542631334021031 0ustar sjpsjp// Unpacks buf1 into g. __kernel void ex3_unpack_north_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, 0, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, 0, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, 0, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, 0, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, 0, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, 1, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, 1, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, 1, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, 1, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, 1, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_south_kernel ( __global const double* buf1, __global double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; g[tiled_index(k, j, nx + 3, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, j, nx + 3, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, j, nx + 3, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, j, nx + 3, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, j, nx + 3, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, j, nx + 2, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, j, nx + 2, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, j, nx + 2, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, j, nx + 2, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, j, nx + 2, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. __kernel void ex3_unpack_west_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, 0, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, 0, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, 0, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, 0, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, 0, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, 1, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, 1, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, 1, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, 1, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, 1, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } // Unpacks buf1 into g. 
__kernel void ex3_unpack_east_kernel ( __global const double* buf1, __global double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; g[tiled_index(k, ny + 3, i, 0)] = buf1[(ipos1 * 5) + 0]; g[tiled_index(k, ny + 3, i, 1)] = buf1[(ipos1 * 5) + 1]; g[tiled_index(k, ny + 3, i, 2)] = buf1[(ipos1 * 5) + 2]; g[tiled_index(k, ny + 3, i, 3)] = buf1[(ipos1 * 5) + 3]; g[tiled_index(k, ny + 3, i, 4)] = buf1[(ipos1 * 5) + 4]; g[tiled_index(k, ny + 2, i, 0)] = buf1[(ipos2 * 5) + 0]; g[tiled_index(k, ny + 2, i, 1)] = buf1[(ipos2 * 5) + 1]; g[tiled_index(k, ny + 2, i, 2)] = buf1[(ipos2 * 5) + 2]; g[tiled_index(k, ny + 2, i, 3)] = buf1[(ipos2 * 5) + 3]; g[tiled_index(k, ny + 2, i, 4)] = buf1[(ipos2 * 5) + 4]; } } } kernels/scalar/.svn/text-base/ex3_pack.cl.svn-base0000444000175600017620000001031111542631334020457 0ustar sjpsjp// Packs g into buf. __kernel void ex3_pack_south_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, nx, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, nx, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, nx, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, nx, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, nx, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, nx + 1, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, nx + 1, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, nx + 1, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, nx + 1, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, nx + 1, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_north_kernel ( __global double* buf, __global const double* g) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int kid = get_global_id(1); const int jsize = get_global_size(0); const int ksize = get_global_size(1); int j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ipos1 = k * ny + j - 2; const int ipos2 = ipos1 + ny * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, j, 3, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, j, 3, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, j, 3, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, j, 3, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, j, 3, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, j, 2, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, j, 2, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, j, 2, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, j, 2, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, j, 2, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_east_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, ny, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, ny, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, ny, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, ny, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, ny, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, ny + 1, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, ny + 1, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, ny + 1, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, ny + 1, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, ny + 1, i, 4)]; } } } // Packs g into buf. __kernel void ex3_pack_west_kernel ( __global double* buf, __global const double* g) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int kid = get_global_id(1); const int isize = get_global_size(0); const int ksize = get_global_size(1); int i, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (i = 2 + iid; i <= nx + 1; i += isize) { const int ipos1 = k * nx + i - 2; const int ipos2 = ipos1 + nx * nz; buf[(ipos1 * 5) + 0] = g[tiled_index(k, 3, i, 0)]; buf[(ipos1 * 5) + 1] = g[tiled_index(k, 3, i, 1)]; buf[(ipos1 * 5) + 2] = g[tiled_index(k, 3, i, 2)]; buf[(ipos1 * 5) + 3] = g[tiled_index(k, 3, i, 3)]; buf[(ipos1 * 5) + 4] = g[tiled_index(k, 3, i, 4)]; buf[(ipos2 * 5) + 0] = g[tiled_index(k, 2, i, 0)]; buf[(ipos2 * 5) + 1] = g[tiled_index(k, 2, i, 1)]; buf[(ipos2 * 5) + 2] = g[tiled_index(k, 2, i, 2)]; buf[(ipos2 * 5) + 3] = g[tiled_index(k, 2, i, 3)]; buf[(ipos2 * 5) + 4] = g[tiled_index(k, 2, i, 4)]; } } } kernels/scalar/.svn/text-base/ex1_unpack.cl.svn-base0000444000175600017620000001043111542650442021024 0ustar sjpsjp// Unpacks jrecv into g. 
__kernel void ex1_unpack_north_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_west_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } // Unpacks jrecv into g. 
__kernel void ex1_unpack_south_kernel( __global const double* jrecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); g[h_index + 0 * m_offset] = jrecv[b_index + 0]; g[h_index + 1 * m_offset] = jrecv[b_index + 1]; g[h_index + 2 * m_offset] = jrecv[b_index + 2]; g[h_index + 3 * m_offset] = jrecv[b_index + 3]; g[h_index + 4 * m_offset] = jrecv[b_index + 4]; } } } } // Unpacks irecv into g. __kernel void ex1_unpack_east_kernel( __global const double* irecv, __global double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. 
const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); g[h_index + 0 * m_offset] = irecv[b_index + 0]; g[h_index + 1 * m_offset] = irecv[b_index + 1]; g[h_index + 2 * m_offset] = irecv[b_index + 2]; g[h_index + 3 * m_offset] = irecv[b_index + 3]; g[h_index + 4 * m_offset] = irecv[b_index + 4]; } } } } kernels/scalar/.svn/text-base/ex1_pack.cl.svn-base0000444000175600017620000001040711542650442020464 0ustar sjpsjp// Packs jsend into g. __kernel void ex1_pack_south_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z - (kblock - 1), j, nx + 1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. 
__kernel void ex1_pack_east_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z - (kblock - 1), ny + 1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs jsend into g. __kernel void ex1_pack_north_kernel( __global double* jsend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate j and z values for loops. 
const int jid = get_global_id(0); const int zid = get_global_id(1); const int jsize = get_global_size(0); const int zsize = get_global_size(1); int j, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst + jid; j <= jend; j += jsize) { int h_index = hyperplane_index(k + z, j, 2, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (jend - jst + 1) * 5) + ((j - jst) * 5); jsend[b_index + 0] = g[h_index + 0 * m_offset]; jsend[b_index + 1] = g[h_index + 1 * m_offset]; jsend[b_index + 2] = g[h_index + 2 * m_offset]; jsend[b_index + 3] = g[h_index + 3 * m_offset]; jsend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } // Packs isend into g. __kernel void ex1_pack_west_kernel( __global double* isend, __global const double* g, __const int k, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* thread_mapping) { // Calculate i and z values for loops. const int iid = get_global_id(0); const int zid = get_global_id(1); const int isize = get_global_size(0); const int zsize = get_global_size(1); int i, z; for (z = 0 + zid; z < kblock; z += zsize) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist + iid; i <= iend; i += isize) { int h_index = hyperplane_index(k + z, 2, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); int b_index = (z * (iend - ist + 1) * 5) + ((i - ist) * 5); isend[b_index + 0] = g[h_index + 0 * m_offset]; isend[b_index + 1] = g[h_index + 1 * m_offset]; isend[b_index + 2] = g[h_index + 2 * m_offset]; isend[b_index + 3] = g[h_index + 3 * m_offset]; isend[b_index + 4] = g[h_index + 4 * m_offset]; } } } } kernels/scalar/.svn/text-base/buts.cl.svn-base0000444000175600017620000005063711545060262017755 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacu_a_value_00 (-dt * tx1 * dx1) #define jacu_a_value_10 (dt * tx2) #define jacu_a_value_20 (0.0e+00) #define jacu_a_value_30 (0.0e+00) #define jacu_a_value_40 (0.0e+00) #define jacu_a_value_01 (dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( - r43 * c34 * tmp2 * u1 )) #define jacu_a_value_11 (dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacu_a_value_21 (dt * tx2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_a_value_31 (dt * tx2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_a_value_41 (dt * tx2 * c2) #define jacu_a_value_02 (dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacu_a_value_12 (dt * tx2 * ( u2 * tmp1 )) #define jacu_a_value_22 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx3) #define jacu_a_value_32 (0.0e+00) #define jacu_a_value_42 (0.0e+00) #define jacu_a_value_03 (dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacu_a_value_13 (dt * tx2 * ( u3 * tmp1 )) #define jacu_a_value_23 (0.0e+00) #define jacu_a_value_33 (dt * tx2 * ( u1 * tmp1 ) -dt * tx1 * ( c34 * tmp1 ) -dt * tx1 * dx4) #define jacu_a_value_43 (0.0e+00) #define jacu_a_value_04 (dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_a_value_14 (dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacu_a_value_24 (dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) -dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_a_value_34 (dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacu_a_value_44 (dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt 
* tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacu_b_value_00 (-dt * ty1 * dy1) #define jacu_b_value_10 (0.0e+00) #define jacu_b_value_20 (dt * ty2) #define jacu_b_value_30 (0.0e+00) #define jacu_b_value_40 (0.0e+00) #define jacu_b_value_01 (dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacu_b_value_11 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacu_b_value_21 (dt * ty2 * ( u1 * tmp1 )) #define jacu_b_value_31 (0.0e+00) #define jacu_b_value_41 (0.0e+00) #define jacu_b_value_02 (dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( - r43 * c34 * tmp2 * u2 )) #define jacu_b_value_12 (dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_b_value_22 (dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacu_b_value_32 (dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacu_b_value_42 (dt * ty2 * c2) #define jacu_b_value_03 (dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u3 )) #define jacu_b_value_13 (0.0e+00) #define jacu_b_value_23 (dt * ty2 * ( u3 * tmp1 )) #define jacu_b_value_33 (dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacu_b_value_43 (0.0e+00) #define jacu_b_value_04 (dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_b_value_14 (dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_b_value_24 (dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacu_b_value_34 (dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u3) #define 
jacu_b_value_44 (dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacu_c_value_00 (-dt * tz1 * dz1) #define jacu_c_value_10 (0.0e+00) #define jacu_c_value_20 (0.0e+00) #define jacu_c_value_30 (dt * tz2) #define jacu_c_value_40 (0.0e+00) #define jacu_c_value_01 (dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacu_c_value_11 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacu_c_value_21 (0.0e+00) #define jacu_c_value_31 (dt * tz2 * ( u1 * tmp1 )) #define jacu_c_value_41 (0.0e+00) #define jacu_c_value_02 (dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u2 )) #define jacu_c_value_12 (0.0e+00) #define jacu_c_value_22 (dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacu_c_value_32 (dt * tz2 * ( u2 * tmp1 )) #define jacu_c_value_42 (0.0e+00) #define jacu_c_value_03 (dt * tz2 * ( - ( u3 * tmp1 ) * ( u3 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( - r43 * c34 * tmp2 * u3 )) #define jacu_c_value_13 (dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacu_c_value_23 (dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacu_c_value_33 (dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacu_c_value_43 (dt * tz2 * c2) #define jacu_c_value_04 (dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacu_c_value_14 (dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacu_c_value_24 (dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacu_c_value_34 (dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( 
r43 * c34 - c1345 ) * tmp2 * u3) #define jacu_c_value_44 (dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacu_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacu_d_value_10 (0.0e+00) #define jacu_d_value_20 (0.0e+00) #define jacu_d_value_30 (0.0e+00) #define jacu_d_value_40 (0.0e+00) #define jacu_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacu_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacu_d_value_21 (0.0e+00) #define jacu_d_value_31 (0.0e+00) #define jacu_d_value_41 (0.0e+00) #define jacu_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacu_d_value_12 (0.0e+00) #define jacu_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacu_d_value_32 (0.0e+00) #define jacu_d_value_42 (0.0e+00) #define jacu_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacu_d_value_13 (0.0e+00) #define jacu_d_value_23 (0.0e+00) #define jacu_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacu_d_value_43 (0.0e+00) #define jacu_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * 
( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacu_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacu_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacu_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacu_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for buts step. __kernel void buts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = (starting_k - (kblock -1)) + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k+1, j, i). int h_below = hyperplane_index(k+1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. double vn0 = v[h_below + 0 * m_offset]; double vn1 = v[h_below + 1 * m_offset]; double vn2 = v[h_below + 2 * m_offset]; double vn3 = v[h_below + 3 * m_offset]; double vn4 = v[h_below + 4 * m_offset]; // Read in u neighbour, for calculation of c. double u0 = u[h_below + 0 * m_offset]; double u1 = u[h_below + 1 * m_offset]; double u2 = u[h_below + 2 * m_offset]; double u3 = u[h_below + 3 * m_offset]; double u4 = u[h_below + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; vn0 = v[h_below + 0 * m_offset]; double v0 = omega * ( jacu_c_value_00 * vn0 ); double v1 = omega * ( jacu_c_value_01 * vn0 ); double v2 = omega * ( jacu_c_value_02 * vn0 ); double v3 = omega * ( jacu_c_value_03 * vn0 ); double v4 = omega * ( jacu_c_value_04 * vn0 ); vn1 = v[h_below + 1 * m_offset]; v0 = v0 + omega * ( jacu_c_value_10 * vn1 ); v1 = v1 + omega * ( jacu_c_value_11 * vn1 ); v2 = v2 + omega * ( jacu_c_value_12 * vn1 ); v3 = v3 + omega * ( jacu_c_value_13 * vn1 ); v4 = v4 + omega * ( jacu_c_value_14 * vn1 ); vn2 = v[h_below + 2 * m_offset]; v0 = v0 + omega * ( jacu_c_value_20 * vn2 ); v1 = v1 + omega * ( jacu_c_value_21 * vn2 ); v2 = v2 + omega * ( jacu_c_value_22 * vn2 ); v3 = v3 + omega * ( jacu_c_value_23 * vn2 ); v4 = v4 + omega * ( jacu_c_value_24 * vn2 ); vn3 = v[h_below + 3 * m_offset]; v0 = v0 + omega * ( jacu_c_value_30 * vn3 ); v1 = v1 + omega * ( jacu_c_value_31 * vn3 ); v2 = v2 + omega * ( jacu_c_value_32 * vn3 ); v3 = v3 + omega * ( jacu_c_value_33 * vn3 ); v4 = v4 + omega * ( jacu_c_value_34 * vn3 ); vn4 = v[h_below + 4 * m_offset]; v0 = v0 + omega * ( jacu_c_value_40 * vn4 ); v1 = v1 + omega * ( jacu_c_value_41 * vn4 ); v2 = v2 + omega * ( jacu_c_value_42 * vn4 ); v3 = v3 + omega * ( jacu_c_value_43 * vn4 ); v4 = v4 + omega * ( jacu_c_value_44 * vn4 ); // Update the values of v based on its neighbours in the j direction. int h_south = hyperplane_index(k, j+1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_south + 0 * m_offset]; vn1 = v[h_south + 1 * m_offset]; vn2 = v[h_south + 2 * m_offset]; vn3 = v[h_south + 3 * m_offset]; vn4 = v[h_south + 4 * m_offset]; // Read in u neighbour, for calculation of b. u0 = u[h_south + 0 * m_offset]; u1 = u[h_south + 1 * m_offset]; u2 = u[h_south + 2 * m_offset]; u3 = u[h_south + 3 * m_offset]; u4 = u[h_south + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_south + 0 * m_offset]; v0 = v0 + omega * ( jacu_b_value_00 * vn0 ); v1 = v1 + omega * ( jacu_b_value_01 * vn0 ); v2 = v2 + omega * ( jacu_b_value_02 * vn0 ); v3 = v3 + omega * ( jacu_b_value_03 * vn0 ); v4 = v4 + omega * ( jacu_b_value_04 * vn0 ); vn1 = v[h_south + 1 * m_offset]; v0 = v0 + omega * ( jacu_b_value_10 * vn1 ); v1 = v1 + omega * ( jacu_b_value_11 * vn1 ); v2 = v2 + omega * ( jacu_b_value_12 * vn1 ); v3 = v3 + omega * ( jacu_b_value_13 * vn1 ); v4 = v4 + omega * ( jacu_b_value_14 * vn1 ); vn2 = v[h_south + 2 * m_offset]; v0 = v0 + omega * ( jacu_b_value_20 * vn2 ); v1 = v1 + omega * ( jacu_b_value_21 * vn2 ); v2 = v2 + omega * ( jacu_b_value_22 * vn2 ); v3 = v3 + omega * ( jacu_b_value_23 * vn2 ); v4 = v4 + omega * ( jacu_b_value_24 * vn2 ); vn3 = v[h_south + 3 * m_offset]; v0 = v0 + omega * ( jacu_b_value_30 * vn3 ); v1 = v1 + omega * ( jacu_b_value_31 * vn3 ); v2 = v2 + omega * ( jacu_b_value_32 * vn3 ); v3 = v3 + omega * ( jacu_b_value_33 * vn3 ); v4 = v4 + omega * ( jacu_b_value_34 * vn3 ); vn4 = v[h_south + 4 * m_offset]; v0 = v0 + omega * ( jacu_b_value_40 * vn4 ); v1 = v1 + omega * ( jacu_b_value_41 * vn4 ); v2 = v2 + omega * ( jacu_b_value_42 * vn4 ); v3 = v3 + omega * ( jacu_b_value_43 * vn4 ); v4 = v4 + omega * ( jacu_b_value_44 * vn4 ); // Update the values of v based on its neighbours in the i direction. // Calculate the index of (k, j, i+1). int h_east = hyperplane_index(k, j, i+1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in v neighbour. vn0 = v[h_east + 0 * m_offset]; vn1 = v[h_east + 1 * m_offset]; vn2 = v[h_east + 2 * m_offset]; vn3 = v[h_east + 3 * m_offset]; vn4 = v[h_east + 4 * m_offset]; // Read in u neighbour, for calculation of a. u0 = u[h_east + 0 * m_offset]; u1 = u[h_east + 1 * m_offset]; u2 = u[h_east + 2 * m_offset]; u3 = u[h_east + 3 * m_offset]; u4 = u[h_east + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_east + 0 * m_offset]; v0 = v0 + omega * ( jacu_a_value_00 * vn0 ); v1 = v1 + omega * ( jacu_a_value_01 * vn0 ); v2 = v2 + omega * ( jacu_a_value_02 * vn0 ); v3 = v3 + omega * ( jacu_a_value_03 * vn0 ); v4 = v4 + omega * ( jacu_a_value_04 * vn0 ); vn1 = v[h_east + 1 * m_offset]; v0 = v0 + omega * ( jacu_a_value_10 * vn1 ); v1 = v1 + omega * ( jacu_a_value_11 * vn1 ); v2 = v2 + omega * ( jacu_a_value_12 * vn1 ); v3 = v3 + omega * ( jacu_a_value_13 * vn1 ); v4 = v4 + omega * ( jacu_a_value_14 * vn1 ); vn2 = v[h_east + 2 * m_offset]; v0 = v0 + omega * ( jacu_a_value_20 * vn2 ); v1 = v1 + omega * ( jacu_a_value_21 * vn2 ); v2 = v2 + omega * ( jacu_a_value_22 * vn2 ); v3 = v3 + omega * ( jacu_a_value_23 * vn2 ); v4 = v4 + omega * ( jacu_a_value_24 * vn2 ); vn3 = v[h_east + 3 * m_offset]; v0 = v0 + omega * ( jacu_a_value_30 * vn3 ); v1 = v1 + omega * ( jacu_a_value_31 * vn3 ); v2 = v2 + omega * ( jacu_a_value_32 * vn3 ); v3 = v3 + omega * ( jacu_a_value_33 * vn3 ); v4 = v4 + omega * ( jacu_a_value_34 * vn3 ); vn4 = v[h_east + 4 * m_offset]; v0 = v0 + omega * ( jacu_a_value_40 * vn4 ); v1 = v1 + omega * ( jacu_a_value_41 * vn4 ); v2 = v2 + omega * ( jacu_a_value_42 * vn4 ); v3 = v3 + omega * ( jacu_a_value_43 * vn4 ); v4 = v4 + omega * ( jacu_a_value_44 * vn4 ); /** * Diagonal block inversion. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacu_d_value_00; double tmat10 = jacu_d_value_10; double tmat20 = jacu_d_value_20; double tmat30 = jacu_d_value_30; double tmat40 = jacu_d_value_40; double tmat01 = jacu_d_value_01; double tmat11 = jacu_d_value_11; double tmat21 = jacu_d_value_21; double tmat31 = jacu_d_value_31; double tmat41 = jacu_d_value_41; double tmat02 = jacu_d_value_02; double tmat12 = jacu_d_value_12; double tmat22 = jacu_d_value_22; double tmat32 = jacu_d_value_32; double tmat42 = jacu_d_value_42; double tmat03 = jacu_d_value_03; double tmat13 = jacu_d_value_13; double tmat23 = jacu_d_value_23; double tmat33 = jacu_d_value_33; double tmat43 = jacu_d_value_43; double tmat04 = jacu_d_value_04; double tmat14 = jacu_d_value_14; double tmat24 = jacu_d_value_24; double tmat34 = jacu_d_value_34; double tmat44 = jacu_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 = tmat11 - tmp * tmat10; tmat21 = tmat21 - tmp * tmat20; tmat31 = tmat31 - tmp * tmat30; tmat41 = tmat41 - tmp * tmat40; v1 = v1 - v0 * tmp; tmp = tmp1 * tmat02; tmat12 = tmat12 - tmp * tmat10; tmat22 = tmat22 - tmp * tmat20; tmat32 = tmat32 - tmp * tmat30; tmat42 = tmat42 - tmp * tmat40; v2 = v2 - v0 * tmp; tmp = tmp1 * tmat03; tmat13 = tmat13 - tmp * tmat10; tmat23 = tmat23 - tmp * tmat20; tmat33 = tmat33 - tmp * tmat30; tmat43 = tmat43 - tmp * tmat40; v3 = v3 - v0 * tmp; tmp = tmp1 * tmat04; tmat14 = tmat14 - tmp * tmat10; tmat24 = tmat24 - tmp * tmat20; tmat34 = tmat34 - tmp * tmat30; tmat44 = tmat44 - tmp * tmat40; v4 = v4 - v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 = tmat22 - tmp * tmat21; tmat32 = tmat32 - tmp * tmat31; tmat42 = tmat42 - tmp * tmat41; v2 = v2 - v1 * tmp; tmp = tmp1 * tmat13; tmat23 = tmat23 - tmp * tmat21; tmat33 = tmat33 - tmp * tmat31; tmat43 = tmat43 - tmp * tmat41; v3 = v3 - v1 * tmp; tmp = tmp1 * tmat14; tmat24 = tmat24 - tmp * tmat21; tmat34 = tmat34 - tmp * tmat31; tmat44 = tmat44 - tmp * tmat41; v4 = v4 - v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 = tmat33 - tmp * tmat32; tmat43 = tmat43 - tmp * tmat42; v3 = v3 - v2 * tmp; tmp = tmp1 * tmat24; tmat34 = tmat34 - tmp * tmat32; tmat44 = tmat44 - tmp * tmat42; v4 = v4 - v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 = tmat44 - tmp * tmat43; v4 = v4 - v3 * tmp; /** * Back substitution. */ v4 = v4 / tmat44; v3 = v3 - tmat43 * v4; v3 = v3 / tmat33; v2 = v2 - tmat32 * v3 - tmat42 * v4; v2 = v2 / tmat22; v1 = v1 - tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 = v1 / tmat11; v0 = v0 - tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 = v0 / tmat00; // Update the values of v. v[h_index + 0 * m_offset] -= v0; v[h_index + 1 * m_offset] -= v1; v[h_index + 2 * m_offset] -= v2; v[h_index + 3 * m_offset] -= v3; v[h_index + 4 * m_offset] -= v4; } } } kernels/scalar/.svn/text-base/blts.cl.svn-base0000444000175600017620000004725511545060262017746 0ustar sjpsjp// Macro. #define c1 c1_def #define c2 c2_def #define c3 c3_def #define c4 c4_def #define c5 c5_def // Sparse matrix definitions. 
#define jacld_a_value_00 (-dt * tz1 * dz1) #define jacld_a_value_10 (0.0e+00) #define jacld_a_value_20 (0.0e+00) #define jacld_a_value_30 (-dt * tz2) #define jacld_a_value_40 (0.0e+00) #define jacld_a_value_01 (-dt * tz2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( - c34 * tmp2 * u1 )) #define jacld_a_value_11 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * c34 * tmp1 - dt * tz1 * dz2) #define jacld_a_value_21 (0.0e+00) #define jacld_a_value_31 (-dt * tz2 * ( u1 * tmp1 )) #define jacld_a_value_41 (0.0e+00) #define jacld_a_value_02 (-dt * tz2 * ( - ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( -c34 * tmp2 * u2 )) #define jacld_a_value_12 (0.0e+00) #define jacld_a_value_22 (-dt * tz2 * ( u3 * tmp1 ) - dt * tz1 * ( c34 * tmp1 ) - dt * tz1 * dz3) #define jacld_a_value_32 (-dt * tz2 * ( u2 * tmp1 )) #define jacld_a_value_42 (0.0e+00) #define jacld_a_value_03 (-dt * tz2 * ( - (( u3 * tmp1 ) * ( u3 * tmp1 )) + 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( -r43 * c34 * tmp2 * u3 )) #define jacld_a_value_13 (-dt * tz2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_a_value_23 (-dt * tz2 * ( - c2 * ( u2 * tmp1 ) )) #define jacld_a_value_33 (-dt * tz2 * ( 2.0e+00 - c2 ) * ( u3 * tmp1 ) - dt * tz1 * ( r43 * c34 * tmp1 ) - dt * tz1 * dz4) #define jacld_a_value_43 (-dt * tz2 * c2) #define jacld_a_value_04 (-dt * tz2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u3 * tmp1 ) ) - dt * tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_a_value_14 (-dt * tz2 * ( - c2 * ( u1 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_a_value_24 (-dt * tz2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * tz1 * ( c34 - c1345 ) * tmp2 * u2) #define jacld_a_value_34 (-dt * tz2 * ( c1 * ( u4 * tmp1 ) - 0.50e+00 * c2 * ( ( u1 * u1 + u2 * u2 + 3.0e+00 * u3 * u3 ) * tmp2 ) ) - dt * tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3) #define 
jacld_a_value_44 (-dt * tz2 * ( c1 * ( u3 * tmp1 ) ) - dt * tz1 * c1345 * tmp1 - dt * tz1 * dz5) #define jacld_b_value_00 (-dt * ty1 * dy1) #define jacld_b_value_10 (0) #define jacld_b_value_20 (-dt * ty2) #define jacld_b_value_30 (0) #define jacld_b_value_40 (0) #define jacld_b_value_01 (-dt * ty2 * ( - ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( - c34 * tmp2 * u1 )) #define jacld_b_value_11 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy2) #define jacld_b_value_21 (-dt * ty2 * ( u1 * tmp1 )) #define jacld_b_value_31 (0) #define jacld_b_value_41 (0) #define jacld_b_value_02 (-dt * ty2 * ( - ( u2 * tmp1 ) * ( u2 * tmp1 ) + 0.5e+00 * c2 * ( ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( -r43 * c34 * tmp2 * u2 )) #define jacld_b_value_12 (-dt * ty2 * ( - c2 * ( u1 * tmp1 ) )) #define jacld_b_value_22 (-dt * ty2 * ( ( 2.0e+00 - c2 ) * ( u2 * tmp1 ) ) - dt * ty1 * ( r43 * c34 * tmp1 ) - dt * ty1 * dy3) #define jacld_b_value_32 (-dt * ty2 * ( - c2 * ( u3 * tmp1 ) )) #define jacld_b_value_42 (-dt * ty2 * c2) #define jacld_b_value_03 (-dt * ty2 * ( - ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( -c34 * tmp2 * u3 )) #define jacld_b_value_13 (0) #define jacld_b_value_23 (-dt * ty2 * ( u3 * tmp1 )) #define jacld_b_value_33 (-dt * ty2 * ( u2 * tmp1 ) - dt * ty1 * ( c34 * tmp1 ) - dt * ty1 * dy4) #define jacld_b_value_43 (0) #define jacld_b_value_04 (-dt * ty2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u2 * tmp1 ) ) - dt * ty1 * ( - ( c34 - c1345 ) * tmp3 * u1 * u1 - ( r43 * c34 - c1345 ) * tmp3 * u2 * u2 - ( c34 - c1345 ) * tmp3 * u3 * u3 - c1345 * tmp2 * u4 )) #define jacld_b_value_14 (-dt * ty2 * ( - c2 * ( u1 * u2 ) * tmp2 ) - dt * ty1 * ( c34 - c1345 ) * tmp2 * u1) #define jacld_b_value_24 (-dt * ty2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( u1 * u1 + 3.0e+00 * u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2) #define jacld_b_value_34 (-dt * ty2 * ( - c2 * ( u2 * u3 ) * tmp2 ) - dt * ty1 * ( 
c34 - c1345 ) * tmp2 * u3) #define jacld_b_value_44 (-dt * ty2 * ( c1 * ( u2 * tmp1 ) ) - dt * ty1 * c1345 * tmp1 - dt * ty1 * dy5) #define jacld_c_value_00 (-dt * tx1 * dx1) #define jacld_c_value_10 (-dt * tx2) #define jacld_c_value_20 (0.0e+00) #define jacld_c_value_30 (0.0e+00) #define jacld_c_value_40 (0.0e+00) #define jacld_c_value_01 (-dt * tx2 * ( - ( u1 * tmp1 ) * ( u1 * tmp1 ) + c2 * 0.5e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) - dt * tx1 * ( -r43 * c34 * tmp2 * u1 )) #define jacld_c_value_11 (-dt * tx2 * ( ( 2.0e+00 - c2 ) * ( u1 * tmp1 ) ) - dt * tx1 * ( r43 * c34 * tmp1 ) - dt * tx1 * dx2) #define jacld_c_value_21 (-dt * tx2 * ( -c2 * ( u2 * tmp1 ) )) #define jacld_c_value_31 (-dt * tx2 * ( -c2 * ( u3 * tmp1 ) )) #define jacld_c_value_41 (-dt * tx2 * c2) #define jacld_c_value_02 (-dt * tx2 * ( - ( u1 * u2 ) * tmp2 ) -dt * tx1 * ( - c34 * tmp2 * u2 )) #define jacld_c_value_12 (-dt * tx2 * ( u2 * tmp1 )) #define jacld_c_value_22 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx3) #define jacld_c_value_32 (0.0e+00) #define jacld_c_value_42 (0.0e+00) #define jacld_c_value_03 (-dt * tx2 * ( - ( u1 * u3 ) * tmp2 ) - dt * tx1 * ( - c34 * tmp2 * u3 )) #define jacld_c_value_13 (-dt * tx2 * ( u3 * tmp1 )) #define jacld_c_value_23 (0.0e+00) #define jacld_c_value_33 (-dt * tx2 * ( u1 * tmp1 ) - dt * tx1 * ( c34 * tmp1 ) - dt * tx1 * dx4) #define jacld_c_value_43 (0.0e+00) #define jacld_c_value_04 (-dt * tx2 * ( ( c2 * ( u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 - c1 * ( u4 * tmp1 ) ) * ( u1 * tmp1 ) ) - dt * tx1 * ( - (r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - c1345 * tmp2 * u4 )) #define jacld_c_value_14 (-dt * tx2 * ( c1 * ( u4 * tmp1 ) - 0.5e+00 * c2 * ( ( 3.0e+00 * u1 * u1 + u2 * u2 + u3 * u3 ) * tmp2 ) ) - dt * tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1) #define jacld_c_value_24 (-dt * tx2 * ( - c2 * ( u2 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u2) 
#define jacld_c_value_34 (-dt * tx2 * ( - c2 * ( u3 * u1 ) * tmp2 ) - dt * tx1 * ( c34 - c1345 ) * tmp2 * u3) #define jacld_c_value_44 (-dt * tx2 * ( c1 * ( u1 * tmp1 ) ) - dt * tx1 * c1345 * tmp1 - dt * tx1 * dx5) #define jacld_d_value_00 (1.0e+00 + dt * 2.0e+00 * ( tx1 * dx1 + ty1 * dy1 + tz1 * dz1 )) #define jacld_d_value_10 (0.0e+00) #define jacld_d_value_20 (0.0e+00) #define jacld_d_value_30 (0.0e+00) #define jacld_d_value_40 (0.0e+00) #define jacld_d_value_01 (dt * 2.0e+00 * ( tx1 * ( - r43 * c34 * tmp2 * u1 ) + ty1 * ( - c34 * tmp2 * u1 ) + tz1 * ( - c34 * tmp2 * u1 ) )) #define jacld_d_value_11 (1.0e+00 + dt * 2.0e+00 * ( tx1 * r43 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx2 + ty1 * dy2 + tz1 * dz2 )) #define jacld_d_value_21 (0.0e+00) #define jacld_d_value_31 (0.0e+00) #define jacld_d_value_41 (0.0e+00) #define jacld_d_value_02 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u2 ) + ty1 * ( - r43 * c34 * tmp2 * u2 ) + tz1 * ( - c34 * tmp2 * u2 ) )) #define jacld_d_value_12 (0.0e+00) #define jacld_d_value_22 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * r43 * c34 * tmp1 + tz1 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx3 + ty1 * dy3 + tz1 * dz3 )) #define jacld_d_value_32 (0.0e+00) #define jacld_d_value_42 (0.0e+00) #define jacld_d_value_03 (dt * 2.0e+00 * ( tx1 * ( - c34 * tmp2 * u3 ) + ty1 * ( - c34 * tmp2 * u3 ) + tz1 * ( - r43 * c34 * tmp2 * u3 ) )) #define jacld_d_value_13 (0.0e+00) #define jacld_d_value_23 (0.0e+00) #define jacld_d_value_33 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c34 * tmp1 + ty1 * c34 * tmp1 + tz1 * r43 * c34 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx4 + ty1 * dy4 + tz1 * dz4 )) #define jacld_d_value_43 (0.0e+00) #define jacld_d_value_04 (dt * 2.0e+00 * ( tx1 * ( - ( r43 * c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + ty1 * ( -( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( 
c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * ( u4 ) ) + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u1 * u1 ) - ( c34 - c1345 ) * tmp3 * ( u2 * u2 ) - ( r43 * c34 - c1345 ) * tmp3 * ( u3 * u3 ) - ( c1345 ) * tmp2 * u4 ) )) #define jacld_d_value_14 (dt * 2.0e+00 * ( tx1 * ( r43 * c34 - c1345 ) * tmp2 * u1 + ty1 * ( c34 - c1345 ) * tmp2 * u1 + tz1 * ( c34 - c1345 ) * tmp2 * u1 )) #define jacld_d_value_24 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u2 + ty1 * ( r43 * c34 - c1345 ) * tmp2 * u2 + tz1 * ( c34 - c1345 ) * tmp2 * u2 )) #define jacld_d_value_34 (dt * 2.0e+00 * ( tx1 * ( c34 - c1345 ) * tmp2 * u3 + ty1 * ( c34 - c1345 ) * tmp2 * u3 + tz1 * ( r43 * c34 - c1345 ) * tmp2 * u3 )) #define jacld_d_value_44 (1.0e+00 + dt * 2.0e+00 * ( tx1 * c1345 * tmp1 + ty1 * c1345 * tmp1 + tz1 * c1345 * tmp1 ) + dt * 2.0e+00 * ( tx1 * dx5 + ty1 * dy5 + tz1 * dz5 )) // OpenCL kernel for blts step. __kernel void blts_kernel( __global double* v, __global const double* u, __global const int* wavefront_offsets_2d, __global const int* wavefront_offsets_3d, __global const int* columns, __global const int* rows, __global const int* thread_mapping, __const int wavefront, __const int starting_k) { // Get thread id. const int gid = get_global_id(0); const int threads = get_global_size(0); // Each thread actually processes (cells / threads) cells in a coalesced manner. 
int cell; for (cell = gid; cell < (isiz1 + 4) * (isiz2 + 4); cell += threads) { const int i = columns[cell]; const int j = rows[cell]; #ifdef APPLU_BLOCKING_OLD const int k = starting_k + (wavefront - (i + j)); const int depth = (wavefront - (i + j)); #else //#ifdef APPLU_BLOCKING_NEW const int k = wavefront - (i + j); #endif const double r43 = ( 4.0e+00 / 3.0e+00 ); const double c1345 = c1_def * c3_def * c4_def * c5_def; const double c34 = c3_def * c4_def; #ifdef APPLU_BLOCKING_OLD if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2 && depth >= 0 && depth <= kblock - 1) { #else //#ifdef APPLU_BLOCKING_NEW if (i >= ist && i <= iend && j >= jst && j <= jend && k >= 1 && k <= nz - 2) { #endif const int h_index = hyperplane_index(k, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Initialise values of v. double v0 = v[h_index + 0 * m_offset]; double v1 = v[h_index + 1 * m_offset]; double v2 = v[h_index + 2 * m_offset]; double v3 = v[h_index + 3 * m_offset]; double v4 = v[h_index + 4 * m_offset]; // Update the values of v based on the cell's neighbour in the k direction. // Calculate the index for (k-1, j, i). const int h_above = hyperplane_index(k-1, j, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of a. double u0 = u[h_above + 0 * m_offset]; double u1 = u[h_above + 1 * m_offset]; double u2 = u[h_above + 2 * m_offset]; double u3 = u[h_above + 3 * m_offset]; double u4 = u[h_above + 4 * m_offset]; // Compute some values based on u0. 
double tmp1 = 1.0e+00 / u0; double tmp2 = tmp1 * tmp1; double tmp3 = tmp1 * tmp2; double vn0 = v[h_above + 0 * m_offset]; v0 -= omega * ( jacld_a_value_00 * vn0 ); v1 -= omega * ( jacld_a_value_01 * vn0 ); v2 -= omega * ( jacld_a_value_02 * vn0 ); v3 -= omega * ( jacld_a_value_03 * vn0 ); v4 -= omega * ( jacld_a_value_04 * vn0 ); double vn1 = v[h_above + 1 * m_offset]; v0 -= omega * ( jacld_a_value_10 * vn1 ); v1 -= omega * ( jacld_a_value_11 * vn1 ); v2 -= omega * ( jacld_a_value_12 * vn1 ); v3 -= omega * ( jacld_a_value_13 * vn1 ); v4 -= omega * ( jacld_a_value_14 * vn1 ); double vn2 = v[h_above + 2 * m_offset]; v0 -= omega * ( jacld_a_value_20 * vn2 ); v1 -= omega * ( jacld_a_value_21 * vn2 ); v2 -= omega * ( jacld_a_value_22 * vn2 ); v3 -= omega * ( jacld_a_value_23 * vn2 ); v4 -= omega * ( jacld_a_value_24 * vn2 ); double vn3 = v[h_above + 3 * m_offset]; v0 -= omega * ( jacld_a_value_30 * vn3 ); v1 -= omega * ( jacld_a_value_31 * vn3 ); v2 -= omega * ( jacld_a_value_32 * vn3 ); v3 -= omega * ( jacld_a_value_33 * vn3 ); v4 -= omega * ( jacld_a_value_34 * vn3 ); double vn4 = v[h_above + 4 * m_offset]; v0 -= omega * ( jacld_a_value_40 * vn4 ); v1 -= omega * ( jacld_a_value_41 * vn4 ); v2 -= omega * ( jacld_a_value_42 * vn4 ); v3 -= omega * ( jacld_a_value_43 * vn4 ); v4 -= omega * ( jacld_a_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the j dimension. // Calculate the index of (k, j-1, i). const int h_north = hyperplane_index(k, j-1, i, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of b. u0 = u[h_north + 0 * m_offset]; u1 = u[h_north + 1 * m_offset]; u2 = u[h_north + 2 * m_offset]; u3 = u[h_north + 3 * m_offset]; u4 = u[h_north + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_north + 0 * m_offset]; v0 -= omega * ( jacld_b_value_00 * vn0 ); v1 -= omega * ( jacld_b_value_01 * vn0 ); v2 -= omega * ( jacld_b_value_02 * vn0 ); v3 -= omega * ( jacld_b_value_03 * vn0 ); v4 -= omega * ( jacld_b_value_04 * vn0 ); vn1 = v[h_north + 1 * m_offset]; v0 -= omega * ( jacld_b_value_10 * vn1 ); v1 -= omega * ( jacld_b_value_11 * vn1 ); v2 -= omega * ( jacld_b_value_12 * vn1 ); v3 -= omega * ( jacld_b_value_13 * vn1 ); v4 -= omega * ( jacld_b_value_14 * vn1 ); vn2 = v[h_north + 2 * m_offset]; v0 -= omega * ( jacld_b_value_20 * vn2 ); v1 -= omega * ( jacld_b_value_21 * vn2 ); v2 -= omega * ( jacld_b_value_22 * vn2 ); v3 -= omega * ( jacld_b_value_23 * vn2 ); v4 -= omega * ( jacld_b_value_24 * vn2 ); vn3 = v[h_north + 3 * m_offset]; v0 -= omega * ( jacld_b_value_30 * vn3 ); v1 -= omega * ( jacld_b_value_31 * vn3 ); v2 -= omega * ( jacld_b_value_32 * vn3 ); v3 -= omega * ( jacld_b_value_33 * vn3 ); v4 -= omega * ( jacld_b_value_34 * vn3 ); vn4 = v[h_north + 4 * m_offset]; v0 -= omega * ( jacld_b_value_40 * vn4 ); v1 -= omega * ( jacld_b_value_41 * vn4 ); v2 -= omega * ( jacld_b_value_42 * vn4 ); v3 -= omega * ( jacld_b_value_43 * vn4 ); v4 -= omega * ( jacld_b_value_44 * vn4 ); // Update the values of v based on the cell's neighbours in the i dimension. // Calculate the index of (k, j, i-1). const int h_west = hyperplane_index(k, j, i-1, 0, wavefront_offsets_2d, wavefront_offsets_3d, thread_mapping); // Read in u neighbours, for calculation of c. u0 = u[h_west + 0 * m_offset]; u1 = u[h_west + 1 * m_offset]; u2 = u[h_west + 2 * m_offset]; u3 = u[h_west + 3 * m_offset]; u4 = u[h_west + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; vn0 = v[h_west + 0 * m_offset]; v0 -= omega * ( jacld_c_value_00 * vn0 ); v1 -= omega * ( jacld_c_value_01 * vn0 ); v2 -= omega * ( jacld_c_value_02 * vn0 ); v3 -= omega * ( jacld_c_value_03 * vn0 ); v4 -= omega * ( jacld_c_value_04 * vn0 ); vn1 = v[h_west + 1 * m_offset]; v0 -= omega * ( jacld_c_value_10 * vn1 ); v1 -= omega * ( jacld_c_value_11 * vn1 ); v2 -= omega * ( jacld_c_value_12 * vn1 ); v3 -= omega * ( jacld_c_value_13 * vn1 ); v4 -= omega * ( jacld_c_value_14 * vn1 ); vn2 = v[h_west + 2 * m_offset]; v0 -= omega * ( jacld_c_value_20 * vn2 ); v1 -= omega * ( jacld_c_value_21 * vn2 ); v2 -= omega * ( jacld_c_value_22 * vn2 ); v3 -= omega * ( jacld_c_value_23 * vn2 ); v4 -= omega * ( jacld_c_value_24 * vn2 ); vn3 = v[h_west + 3 * m_offset]; v0 -= omega * ( jacld_c_value_30 * vn3 ); v1 -= omega * ( jacld_c_value_31 * vn3 ); v2 -= omega * ( jacld_c_value_32 * vn3 ); v3 -= omega * ( jacld_c_value_33 * vn3 ); v4 -= omega * ( jacld_c_value_34 * vn3 ); vn4 = v[h_west + 4 * m_offset]; v0 -= omega * ( jacld_c_value_40 * vn4 ); v1 -= omega * ( jacld_c_value_41 * vn4 ); v2 -= omega * ( jacld_c_value_42 * vn4 ); v3 -= omega * ( jacld_c_value_43 * vn4 ); v4 -= omega * ( jacld_c_value_44 * vn4 ); /** * Diagonal block inversion. * Forward elimination. */ // Read in u values. u0 = u[h_index + 0 * m_offset]; u1 = u[h_index + 1 * m_offset]; u2 = u[h_index + 2 * m_offset]; u3 = u[h_index + 3 * m_offset]; u4 = u[h_index + 4 * m_offset]; // Compute some values based on u0. 
tmp1 = 1.0e+00 / u0; tmp2 = tmp1 * tmp1; tmp3 = tmp1 * tmp2; double tmat00 = jacld_d_value_00; double tmat10 = jacld_d_value_10; double tmat20 = jacld_d_value_20; double tmat30 = jacld_d_value_30; double tmat40 = jacld_d_value_40; double tmat01 = jacld_d_value_01; double tmat11 = jacld_d_value_11; double tmat21 = jacld_d_value_21; double tmat31 = jacld_d_value_31; double tmat41 = jacld_d_value_41; double tmat02 = jacld_d_value_02; double tmat12 = jacld_d_value_12; double tmat22 = jacld_d_value_22; double tmat32 = jacld_d_value_32; double tmat42 = jacld_d_value_42; double tmat03 = jacld_d_value_03; double tmat13 = jacld_d_value_13; double tmat23 = jacld_d_value_23; double tmat33 = jacld_d_value_33; double tmat43 = jacld_d_value_43; double tmat04 = jacld_d_value_04; double tmat14 = jacld_d_value_14; double tmat24 = jacld_d_value_24; double tmat34 = jacld_d_value_34; double tmat44 = jacld_d_value_44; // ip = 0. double tmp; tmp1 = 1.0e+00 / tmat00; tmp = tmp1 * tmat01; tmat11 -= tmp * tmat10; tmat21 -= tmp * tmat20; tmat31 -= tmp * tmat30; tmat41 -= tmp * tmat40; v1 -= v0 * tmp; tmp = tmp1 * tmat02; tmat12 -= tmp * tmat10; tmat22 -= tmp * tmat20; tmat32 -= tmp * tmat30; tmat42 -= tmp * tmat40; v2 -= v0 * tmp; tmp = tmp1 * tmat03; tmat13 -= tmp * tmat10; tmat23 -= tmp * tmat20; tmat33 -= tmp * tmat30; tmat43 -= tmp * tmat40; v3 -= v0 * tmp; tmp = tmp1 * tmat04; tmat14 -= tmp * tmat10; tmat24 -= tmp * tmat20; tmat34 -= tmp * tmat30; tmat44 -= tmp * tmat40; v4 -= v0 * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat11; tmp = tmp1 * tmat12; tmat22 -= tmp * tmat21; tmat32 -= tmp * tmat31; tmat42 -= tmp * tmat41; v2 -= v1 * tmp; tmp = tmp1 * tmat13; tmat23 -= tmp * tmat21; tmat33 -= tmp * tmat31; tmat43 -= tmp * tmat41; v3 -= v1 * tmp; tmp = tmp1 * tmat14; tmat24 -= tmp * tmat21; tmat34 -= tmp * tmat31; tmat44 -= tmp * tmat41; v4 -= v1 * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat22; tmp = tmp1 * tmat23; tmat33 -= tmp * tmat32; tmat43 -= tmp * tmat42; v3 -= v2 * tmp; tmp = tmp1 * tmat24; tmat34 -= tmp * tmat32; tmat44 -= tmp * tmat42; v4 -= v2 * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat33; tmp = tmp1 * tmat34; tmat44 -= tmp * tmat43; v4 -= v3 * tmp; /** * Back substitution. */ v4 /= tmat44; v[h_index + 4 * m_offset] = v4; v3 -= tmat43 * v4; v3 /= tmat33; v[h_index + 3 * m_offset] = v3; v2 -= tmat32 * v3 - tmat42 * v4; v2 /= tmat22; v[h_index + 2 * m_offset] = v2; v1 -= tmat21 * v2 - tmat31 * v3 - tmat41 * v4; v1 /= tmat11; v[h_index + 1 * m_offset] = v1; v0 -= tmat10 * v1 - tmat20 * v2 - tmat30 * v3 - tmat40 * v4; v0 /= tmat00; v[h_index + 0 * m_offset] = v0; } } } kernels/vector2/rhs/zeta/.svn/entries0000444000175600017620000000166511551607760016325 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/zeta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_zeta3.cl file 2011-03-28T15:22:04.000000Z bbfbdd4abe0adb0dda1090ba53692ecf 2011-03-23T14:53:37.138628Z 1172 sjp 1802 rhs_zeta4.cl file 2011-03-28T15:22:04.000000Z c26ce76711d57332549e844692f82013 2011-03-23T14:53:37.138628Z 1172 sjp 2294 rhs_zeta_dissipation.cl file 2011-03-28T15:22:04.000000Z 1b9c5d058fa42b269e992154b4ea0599 2011-03-23T14:53:37.138628Z 1172 sjp 2484 rhs_zeta1.cl file 2011-03-28T15:22:04.000000Z 0c8ea304c9f9553c6dba3d1509915414 2011-03-23T14:53:37.138628Z 1172 sjp 1471 rhs_zeta2.cl file 2011-03-28T15:22:04.000000Z 15a7441d85c332a77f0ab7bafff215a2 2011-03-23T14:53:37.138628Z 1172 sjp 1405 
kernels/vector2/rhs/xi/.svn/entries0000444000175600017620000000167111551607760015777 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/xi svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_xi_dissipation.cl file 2011-03-28T15:21:59.000000Z 15ea5b4435801f138ab5d4274e7f5d5b 2011-03-23T14:53:37.138628Z 1172 sjp 2511 rhs_xi1.cl file 1179 2011-03-29T14:26:15.000000Z 0bc23ab6d9282ba9c194b1f217a5a31f 2011-03-31T11:08:35.335679Z 1179 sjp 3027 rhs_xi2.cl file 1179 2011-03-29T14:26:22.000000Z cab6f80a4b5bc8a4e77ccdcb10a780b9 2011-03-31T11:08:35.335679Z 1179 sjp 2756 rhs_xi3.cl file 1179 2011-03-29T14:27:19.000000Z e079d32b796497343d253725bd964d65 2011-03-31T11:08:35.335679Z 1179 sjp 3942 rhs_xi4.cl file 1179 2011-03-29T14:27:26.000000Z 58e6770b3cb4b5d0fb91ff1c0808d4dc 2011-03-31T11:08:35.335679Z 1179 sjp 5205 kernels/vector2/rhs/eta/.svn/entries0000444000175600017620000000167311551607761016133 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/eta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_eta1.cl file 1179 2011-03-29T14:33:20.000000Z 0babb080e757c75c7cd3472a0e70166f 2011-03-31T11:08:35.335679Z 1179 sjp 2904 rhs_eta2.cl file 1179 2011-03-29T14:48:38.000000Z 6b3477d606969baff7d497ee17c2b30f 2011-03-31T11:08:35.335679Z 1179 sjp 2701 rhs_eta3.cl file 1179 2011-03-29T14:53:53.000000Z 72161d0362d178d2e99261ca913243a1 2011-03-31T11:08:35.335679Z 1179 sjp 3988 rhs_eta4.cl file 2011-03-28T15:22:00.000000Z 2b8d2fc469624b5ebbcea4546ecac5c7 2011-03-23T14:53:37.138628Z 1172 sjp 2315 rhs_eta_dissipation.cl file 2011-03-28T15:22:00.000000Z eb50830bbdd92d5e408772c89a14a9fa 2011-03-23T14:53:37.138628Z 1172 sjp 2748 kernels/vector2/rhs/.svn/text-base/rhs_setup.cl.svn-base0000444000175600017620000000353211545060262021717 0ustar sjpsjp// OpenCL for updating rsd based on frct. 
__kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ibound = ((nx + 1)/vlength)*vlength; const int iinc = (isize * vlength); for (i = 2 + (iid*vlength); i <= ibound; i+= iinc) { vdouble rsd_v, frct_v; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; frct_v = vload(0, frct + t_index + 0 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 0 * t_offset); frct_v = vload(0, frct + t_index + 1 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 1 * t_offset); frct_v = vload(0, frct + t_index + 2 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 2 * t_offset); frct_v = vload(0, frct + t_index + 3 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 3 * t_offset); frct_v = vload(0, frct + t_index + 4 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 4 * t_offset); } for (; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/vector/bak/rhs/eta/rhs_eta_dissipation.cl0000644000175600017620000000527411544123630020672 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. 
*/ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. 
if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector/bak/rhs/eta/rhs_eta4.cl0000644000175600017620000000441311544123630016342 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector/bak/rhs/eta/rhs_eta3.cl0000644000175600017620000000754611553015551016355 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. 
if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21j = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31j = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41j = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51j = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j-1, i, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j-1, i, 1)); const vdouble u21jm1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j-1, i, 2)); const vdouble u31jm1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j-1, i, 3)); const vdouble u41jm1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j-1, i, 4)); const vdouble u51jm1 = tmp * u4m_v; const vdouble flux1_v = ty3 * ( u21j - u21jm1 ); const vdouble flux2_v = (4.0e+00/3.0e+00) * ty3 * ( u31j - u31jm1 ); const vdouble flux3_v = ty3 * ( u41j - u41jm1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); // Write out the flux vector. 
vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21j = tmp * u[tiled_index(k, j, i, 1)]; const double u31j = tmp * u[tiled_index(k, j, i, 2)]; const double u41j = tmp * u[tiled_index(k, j, i, 3)]; const double u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; const double u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; const double u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; const double u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; const double u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/vector/bak/rhs/eta/rhs_eta2.cl0000644000175600017620000000515311553015562016346 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fjp_v, fjm_v; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 0)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 0)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 1)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 1)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 2)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 2)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 3)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 3)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 4)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 4)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); 
rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/vector/bak/rhs/eta/rhs_eta1.cl0000644000175600017620000000545011553015574016350 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u31 = u2_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u2_v; vdouble flux1_v = u1_v * u31; vdouble flux2_v = u2_v * u31 + c2 * (u4_v - q); vdouble flux3_v = u3_v * u31; vdouble flux4_v = (c1 * u4_v - c2 * q) * u31; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } for (; i <= iend; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in the u values. double u0 = u[t_index + 0 * t_offset]; double u1 = u[t_index + 1 * t_offset]; double u2 = u[t_index + 2 * t_offset]; double u3 = u[t_index + 3 * t_offset]; double u4 = u[t_index + 4 * t_offset]; // Update flux. flux[t_index + 0 * t_offset] = u2; const double u31 = u2 / u0; const double q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[t_index + 1 * t_offset] = u1 * u31; flux[t_index + 2 * t_offset] = u2 * u31 + c2 * ( u4 - q ); flux[t_index + 3 * t_offset] = u3 * u31; flux[t_index + 4 * t_offset] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/vector/bak/rhs/.svn/entries0000444000175600017620000000056711551607761016036 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 xi dir zeta dir eta dir rhs_setup.cl file 1179 2011-03-29T14:22:41.000000Z 4a55f4166fdc1e0bfa3cf33fcfabe3e2 2011-03-31T11:08:35.335679Z 1179 sjp 1882 kernels/vector/bak/rhs/zeta/rhs_zeta_dissipation.cl0000644000175600017620000000466411544123634021264 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/bak/rhs/zeta/rhs_zeta4.cl0000644000175600017620000000436611544123634016741 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector/bak/rhs/zeta/rhs_zeta3.cl0000644000175600017620000000341211544123634016727 0ustar sjpsjp/** * Third part of zeta-direction flux differences. * Update flux based on u. 
*/ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector/bak/rhs/zeta/rhs_zeta2.cl0000644000175600017620000000257511544123634016737 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector/bak/rhs/zeta/rhs_zeta1.cl0000644000175600017620000000267711544123634016741 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector/bak/rhs/xi/rhs_xi_dissipation.cl0000644000175600017620000000471711544123627020417 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector/bak/rhs/xi/rhs_xi4.cl0000644000175600017620000001212511553015412016055 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd0_v = vload(0, rsd + tiled_index(k, j, i, 0)); vdouble um0_v = vload(0, u + tiled_index(k, j, i-1, 0)); vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble up0_v = vload(0, u + tiled_index(k, j, i+1, 0)); rsd0_v += dx1 * tx1 * ( um0_v - 2.0e+00 * u0_v + up0_v ); vstore(rsd0_v, 0, rsd + tiled_index(k, j, i, 0)); vdouble rsd1_v = vload(0, rsd + tiled_index(k, j, i, 1)); vdouble um1_v = vload(0, u + tiled_index(k, j, i-1, 1)); vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); vdouble up1_v = vload(0, u + tiled_index(k, j, i+1, 1)); vdouble fluxp1_v = vload(0, flux + tiled_index(k, j, i+1, 1)); vdouble flux1_v = vload(0, flux + tiled_index(k, j, i, 1)); rsd1_v += tx3 * c3 * c4 * ( fluxp1_v - flux1_v ) + dx2 * tx1 * ( um1_v - 2.0e+00 * u1_v + up1_v ); vstore(rsd1_v, 0, rsd + tiled_index(k, j, i, 1)); vdouble rsd2_v = vload(0, rsd + tiled_index(k, j, i, 2)); vdouble um2_v = vload(0, u + tiled_index(k, j, i-1, 2)); vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); vdouble up2_v = vload(0, u + tiled_index(k, j, i+1, 2)); vdouble fluxp2_v = vload(0, flux + tiled_index(k, j, i+1, 2)); vdouble flux2_v = vload(0, flux + tiled_index(k, j, i, 2)); rsd2_v += tx3 * c3 * c4 * ( fluxp2_v - flux2_v ) + dx3 * tx1 * ( um2_v - 2.0e+00 * u2_v + up2_v ); vstore(rsd2_v, 0, rsd + tiled_index(k, j, i, 2)); vdouble rsd3_v = vload(0, rsd + tiled_index(k, j, i, 3)); vdouble um3_v = vload(0, u + tiled_index(k, j, i-1, 3)); vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); vdouble up3_v = vload(0, u + tiled_index(k, j, i+1, 3)); vdouble 
fluxp3_v = vload(0, flux + tiled_index(k, j, i+1, 3)); vdouble flux3_v = vload(0, flux + tiled_index(k, j, i, 3)); rsd3_v += tx3 * c3 * c4 * ( fluxp3_v - flux3_v ) + dx4 * tx1 * ( um3_v - 2.0e+00 * u3_v + up3_v ); vstore(rsd3_v, 0, rsd + tiled_index(k, j, i, 3)); vdouble rsd4_v = vload(0, rsd + tiled_index(k, j, i, 4)); vdouble um4_v = vload(0, u + tiled_index(k, j, i-1, 4)); vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); vdouble up4_v = vload(0, u + tiled_index(k, j, i+1, 4)); vdouble fluxp4_v = vload(0, flux + tiled_index(k, j, i+1, 4)); vdouble flux4_v = vload(0, flux + tiled_index(k, j, i, 4)); rsd4_v += tx3 * c3 * c4 * ( fluxp4_v - flux4_v ) + dx5 * tx1 * ( um4_v - 2.0e+00 * u4_v + up4_v ); vstore(rsd4_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector/bak/rhs/xi/rhs_xi3.cl0000644000175600017620000000754611553015327016074 0ustar sjpsjp/** * The 
third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21i = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31i = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41i = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51i = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j, i-1, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j, i-1, 1)); const vdouble u21im1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j, i-1, 2)); const vdouble u31im1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j, i-1, 3)); const vdouble u41im1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j, i-1, 4)); const vdouble u51im1 = tmp * u4m_v; const vdouble flux1_v = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); const vdouble flux2_v = tx3 * ( u31i - u31im1 ); const vdouble flux3_v = tx3 * ( u41i - u41im1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * 
u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); // Write out the flux vector. vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } for (; i <= L2; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21i = tmp * u[tiled_index(k, j, i, 1)]; const double u31i = tmp * u[tiled_index(k, j, i, 2)]; const double u41i = tmp * u[tiled_index(k, j, i, 3)]; const double u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; const double u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; const double u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; const double u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; const double u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector/bak/rhs/xi/rhs_xi2.cl0000644000175600017620000000524211553015372016062 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fip_v, fim_v; const vdouble tx2_v = (vdouble) tx2; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 0)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 0)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 1)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 1)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 2)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 2)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 3)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 3)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 4)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 4)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 
2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/vector/bak/rhs/xi/rhs_xi1.cl0000644000175600017620000000564611553015400016061 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = L1 + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u21 = u1_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u1_v; vdouble flux1_v = u1_v * u21 + c2 * ( u4_v - q ); vdouble flux2_v = u2_v * u21; vdouble flux3_v = u3_v * u21; vdouble flux4_v = (c1 * u4_v - c2 * q) * u21; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } for (; i <= L2; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flux[t_index + 0 * t_offset] = u[t_index + 1 * t_offset]; const double u21 = u[t_index + 1 * t_offset] / u[t_index + 0 * t_offset]; const double q = 0.50e+00 * ( u[t_index + 1 * t_offset] * u[t_index + 1 * t_offset] + u[t_index + 2 * t_offset] * u[t_index + 2 * t_offset] + u[t_index + 3 * t_offset] * u[t_index + 3 * t_offset] ) / u[t_index + 0 * t_offset]; flux[t_index + 1 * t_offset] = u[t_index + 1 * t_offset] * u21 + c2 * ( u[t_index + 4 * t_offset] - q ); flux[t_index + 2 * t_offset] = u[t_index + 2 * t_offset] * u21; flux[t_index + 3 * t_offset] = u[t_index + 3 * t_offset] * u21; flux[t_index + 4 * t_offset] = ( c1 * u[t_index + 4 * t_offset] - c2 * q ) * u21; } } } } kernels/vector/rhs/xi/.svn/entries0000444000175600017620000000165111753220710015701 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/xi svn://svn/perfmodelling 2011-04-20T18:08:11.549895Z 1212 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_xi_dissipation.cl file 2011-04-20T18:07:04.000000Z 15ea5b4435801f138ab5d4274e7f5d5b 2011-03-23T14:53:37.138628Z 1172 sjp 2511 rhs_xi1.cl file 2011-04-20T18:07:04.000000Z a829da100c3112baa793312ba713c519 2011-04-20T18:08:11.549895Z 1212 sjp 1658 rhs_xi2.cl file 2011-04-20T18:07:04.000000Z 9301c20ad840ed153861478757aa0acc 2011-04-20T18:08:11.549895Z 1212 sjp 1181 rhs_xi3.cl file 2011-04-20T18:07:04.000000Z 718efee06bfcfadfdf933c1f2767ed4e 2011-04-20T18:08:11.549895Z 1212 sjp 1938 rhs_xi4.cl file 2011-04-20T18:07:04.000000Z 9425040f89f7eb3c5076158237ec5adc 2011-04-20T18:08:11.549895Z 1212 sjp 2181 
kernels/vector/rhs/zeta/.svn/entries0000444000175600017620000000166511753220711016232 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/zeta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_zeta3.cl file 2011-04-20T18:07:12.000000Z bbfbdd4abe0adb0dda1090ba53692ecf 2011-03-23T14:53:37.138628Z 1172 sjp 1802 rhs_zeta4.cl file 2011-04-20T18:07:12.000000Z c26ce76711d57332549e844692f82013 2011-03-23T14:53:37.138628Z 1172 sjp 2294 rhs_zeta_dissipation.cl file 2011-04-20T18:07:12.000000Z 1b9c5d058fa42b269e992154b4ea0599 2011-03-23T14:53:37.138628Z 1172 sjp 2484 rhs_zeta1.cl file 2011-04-20T18:07:12.000000Z 0c8ea304c9f9553c6dba3d1509915414 2011-03-23T14:53:37.138628Z 1172 sjp 1471 rhs_zeta2.cl file 2011-04-20T18:07:12.000000Z 15a7441d85c332a77f0ab7bafff215a2 2011-03-23T14:53:37.138628Z 1172 sjp 1405 kernels/vector/rhs/eta/.svn/entries0000444000175600017620000000165711753220711016041 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/eta svn://svn/perfmodelling 2011-04-20T18:08:11.549895Z 1212 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_eta1.cl file 2011-04-20T18:07:08.000000Z 0de2226b22a143da5ac08561851676a5 2011-04-20T18:08:11.549895Z 1212 sjp 1534 rhs_eta2.cl file 2011-04-20T18:07:08.000000Z 2899b819175bbc5d3f2985f90d51e251 2011-04-20T18:08:11.549895Z 1212 sjp 1190 rhs_eta3.cl file 2011-04-20T18:07:08.000000Z 8a206bad74a0b919ae9327f9e76b948d 2011-04-20T18:08:11.549895Z 1212 sjp 1910 rhs_eta4.cl file 2011-04-20T18:07:08.000000Z 2b8d2fc469624b5ebbcea4546ecac5c7 2011-03-23T14:53:37.138628Z 1172 sjp 2315 rhs_eta_dissipation.cl file 2011-04-20T18:07:08.000000Z eb50830bbdd92d5e408772c89a14a9fa 2011-03-23T14:53:37.138628Z 1172 sjp 2748 kernels/vector/rhs/.svn/text-base/rhs_zeta.cl.svn-base0000444000175600017620000001602111553620400021430 0ustar sjpsjp/** * "Fused" version of rhs_zeta_{1,2,3,4,dissipation}. 
*/ __kernel void rhs_zeta_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; int m; double q, u41; const double c1 = c1_def; const double c2 = c2_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_zeta1 for (k = 0 + kid; k <= nz - 1; k += ksize) { flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } // Rhs_zeta2 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * 
( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } // Rhs_zeta3 for (k = 1 + kid; k <= nz - 1; k += ksize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } // Rhs_zeta4 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] 
+ tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } // Rhs_zeta_dissipation for (k = 1 + kid; k <= nz - 2; k += ksize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/rhs/.svn/text-base/rhs_xi.cl.svn-base0000444000175600017620000001567311553620400021121 
0ustar sjpsjp/** * "Fused" version of rhs_xi_{1,2,3,4,dissipation}. */ __kernel void rhs_xi_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ //double q, u21; //int L1, L2; int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; int m; double ist1, iend1; const double c1 = c1_def; //const double c2 = c2_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Set L1. /*if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; }*/ // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { // Rhs_xi1 /*for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; }*/ // Rhs_xi2 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } // Rhs_xi3 for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * 
( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } // Rhs_xi4 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } // Rhs_xi_dissipation for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector/rhs/.svn/text-base/rhs_setup.cl.svn-base0000444000175600017620000000162111553617672021646 0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { for (i = 2 + iid; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/vector/rhs/.svn/text-base/rhs_eta.cl.svn-base0000444000175600017620000001453311553620400021244 0ustar sjpsjp/** * "Fused" version of rhs_eta_{2,3,4,dissipation}. */ __kernel void rhs_eta_kernel ( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; int m; double jst1, jend1; const double c1 = c1_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_eta2 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } // Rhs_eta3 for (j = jst + jid; j <= L2; j += jsize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } // Rhs_eta4 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + 
dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } // Rhs_eta_dissipation for (j = jst + jid; j <= jend; j += jsize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. 
if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/scalar/rhs/zeta/.svn/entries0000444000175600017620000000166511753220711016175 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/scalar/rhs/zeta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_zeta3.cl file 2011-04-19T15:38:08.000000Z bbfbdd4abe0adb0dda1090ba53692ecf 2011-03-23T14:53:37.138628Z 1172 sjp 1802 rhs_zeta4.cl file 2011-04-19T15:38:05.000000Z c26ce76711d57332549e844692f82013 2011-03-23T14:53:37.138628Z 1172 sjp 2294 rhs_zeta_dissipation.cl file 2011-04-19T15:38:03.000000Z 1b9c5d058fa42b269e992154b4ea0599 2011-03-23T14:53:37.138628Z 1172 sjp 2484 rhs_zeta1.cl file 2011-04-19T15:38:19.000000Z 0c8ea304c9f9553c6dba3d1509915414 2011-03-23T14:53:37.138628Z 1172 sjp 1471 rhs_zeta2.cl file 2011-04-19T15:38:10.000000Z 15a7441d85c332a77f0ab7bafff215a2 2011-03-23T14:53:37.138628Z 1172 sjp 1405 kernels/scalar/rhs/xi/.svn/entries0000444000175600017620000000165111753220711015645 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/scalar/rhs/xi svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 
rhs_xi_dissipation.cl file 2011-03-21T11:55:04.000000Z 15ea5b4435801f138ab5d4274e7f5d5b 2011-03-23T14:53:37.138628Z 1172 sjp 2511 rhs_xi1.cl file 2011-05-30T16:03:46.619803Z a829da100c3112baa793312ba713c519 2011-03-23T14:53:37.138628Z 1172 sjp 1658 rhs_xi2.cl file 2011-03-21T11:51:37.000000Z 9301c20ad840ed153861478757aa0acc 2011-03-23T14:53:37.138628Z 1172 sjp 1181 rhs_xi3.cl file 2011-03-21T11:51:58.000000Z 718efee06bfcfadfdf933c1f2767ed4e 2011-03-23T14:53:37.138628Z 1172 sjp 1938 rhs_xi4.cl file 2011-03-21T11:52:45.000000Z 9425040f89f7eb3c5076158237ec5adc 2011-03-23T14:53:37.138628Z 1172 sjp 2181 kernels/scalar/rhs/eta/.svn/entries0000444000175600017620000000165711753220711016004 0ustar sjpsjp10 dir 1538 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/scalar/rhs/eta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_eta1.cl file 2011-04-19T15:39:35.000000Z 0de2226b22a143da5ac08561851676a5 2011-03-23T14:53:37.138628Z 1172 sjp 1534 rhs_eta2.cl file 2011-04-19T15:39:37.000000Z 2899b819175bbc5d3f2985f90d51e251 2011-03-23T14:53:37.138628Z 1172 sjp 1190 rhs_eta3.cl file 2011-04-19T15:39:39.000000Z 8a206bad74a0b919ae9327f9e76b948d 2011-03-23T14:53:37.138628Z 1172 sjp 1910 rhs_eta4.cl file 2011-04-19T15:39:41.000000Z 2b8d2fc469624b5ebbcea4546ecac5c7 2011-03-23T14:53:37.138628Z 1172 sjp 2315 rhs_eta_dissipation.cl file 2011-04-19T15:39:45.000000Z eb50830bbdd92d5e408772c89a14a9fa 2011-03-23T14:53:37.138628Z 1172 sjp 2748 kernels/scalar/rhs/.svn/text-base/rhs_zeta.cl.svn-base0000444000175600017620000001602111553617671021413 0ustar sjpsjp/** * "Fused" version of rhs_zeta_{1,2,3,4,dissipation}. */ __kernel void rhs_zeta_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. 
*/ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; int m; double q, u41; const double c1 = c1_def; const double c2 = c2_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_zeta1 for (k = 0 + kid; k <= nz - 1; k += ksize) { flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } // Rhs_zeta2 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] 
- tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } // Rhs_zeta3 for (k = 1 + kid; k <= nz - 1; k += ksize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } // Rhs_zeta4 for (k = 1 + kid; k <= nz - 2; k += ksize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] 
- 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } // Rhs_zeta_dissipation for (k = 1 + kid; k <= nz - 2; k += ksize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/scalar/rhs/.svn/text-base/rhs_xi.cl.svn-base0000444000175600017620000001374111553617672021077 0ustar sjpsjp/** * "Fused" version of rhs_xi_{2,3,4,dissipation}. 
*/ __kernel void rhs_xi_kernel( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; int m; double ist1, iend1; const double c1 = c1_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { // Rhs_xi2 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } // Rhs_xi3 for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * 
( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } // Rhs_xi4 for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } // Rhs_xi_dissipation for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update 
ist1 and iend1 based on north and south. if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/scalar/rhs/.svn/text-base/rhs_setup.cl.svn-base0000444000175600017620000000162111542404560021575 0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { for (i = 2 + iid; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/scalar/rhs/.svn/text-base/rhs_eta.cl.svn-base0000444000175600017620000001453311553617671021227 0ustar sjpsjp/** * "Fused" version of rhs_eta_{2,3,4,dissipation}. */ __kernel void rhs_eta_kernel ( __global const double* u, __global double* rsd, __global double* flux) { /** * Local variables. */ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; int m; double jst1, jend1; const double c1 = c1_def; const double c3 = c3_def; const double c4 = c4_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (i = ist + iid; i <= iend; i += isize) { // Rhs_eta2 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } // Rhs_eta3 for (j = jst + jid; j <= L2; j += jsize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } // Rhs_eta4 for (j = jst + jid; j <= jend; j += jsize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + 
dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } // Rhs_eta_dissipation for (j = jst + jid; j <= jend; j += jsize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. 
if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector2/rhs/zeta/.svn/text-base/rhs_zeta_dissipation.cl.svn-base0000444000175600017620000000466411542404557025110 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector2/rhs/zeta/.svn/text-base/rhs_zeta4.cl.svn-base0000444000175600017620000000436611542404560022557 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector2/rhs/zeta/.svn/text-base/rhs_zeta3.cl.svn-base0000444000175600017620000000341211542404556022552 0ustar sjpsjp/** * Third part of zeta-direction flux differences. 
* Update flux based on u. */ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector2/rhs/zeta/.svn/text-base/rhs_zeta2.cl.svn-base0000444000175600017620000000257511542404556022562 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector2/rhs/zeta/.svn/text-base/rhs_zeta1.cl.svn-base0000444000175600017620000000267711542404557022565 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector2/rhs/xi/.svn/text-base/rhs_xi_dissipation.cl.svn-base0000444000175600017620000000471711542404560024233 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector2/rhs/xi/.svn/text-base/rhs_xi4.cl.svn-base0000444000175600017620000001212511545060262021701 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd0_v = vload(0, rsd + tiled_index(k, j, i, 0)); vdouble um0_v = vload(0, u + tiled_index(k, j, i-1, 0)); vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble up0_v = vload(0, u + tiled_index(k, j, i+1, 0)); rsd0_v += dx1 * tx1 * ( um0_v - 2.0e+00 * u0_v + up0_v ); vstore(rsd0_v, 0, rsd + tiled_index(k, j, i, 0)); vdouble rsd1_v = vload(0, rsd + tiled_index(k, j, i, 1)); vdouble um1_v = vload(0, u + tiled_index(k, j, i-1, 1)); vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); vdouble up1_v = vload(0, u + tiled_index(k, j, i+1, 1)); vdouble fluxp1_v = vload(0, flux + tiled_index(k, j, i+1, 1)); vdouble flux1_v = vload(0, flux + tiled_index(k, j, i, 1)); rsd1_v += tx3 * c3 * c4 * ( fluxp1_v - flux1_v ) + dx2 * tx1 * ( um1_v - 2.0e+00 * u1_v + up1_v ); vstore(rsd1_v, 0, rsd + tiled_index(k, j, i, 1)); vdouble rsd2_v = vload(0, rsd + tiled_index(k, j, i, 2)); vdouble um2_v = vload(0, u + tiled_index(k, j, i-1, 2)); vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); vdouble up2_v = vload(0, u + tiled_index(k, j, i+1, 2)); vdouble fluxp2_v = vload(0, flux + tiled_index(k, j, i+1, 2)); vdouble flux2_v = vload(0, flux + tiled_index(k, j, i, 2)); rsd2_v += tx3 * c3 * c4 * ( fluxp2_v - flux2_v ) + dx3 * tx1 * ( um2_v - 2.0e+00 * u2_v + up2_v ); vstore(rsd2_v, 0, rsd + tiled_index(k, j, i, 2)); vdouble rsd3_v = vload(0, rsd + tiled_index(k, j, i, 3)); vdouble um3_v = vload(0, u + tiled_index(k, j, i-1, 3)); vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); vdouble up3_v = vload(0, u + tiled_index(k, j, i+1, 3)); vdouble 
fluxp3_v = vload(0, flux + tiled_index(k, j, i+1, 3)); vdouble flux3_v = vload(0, flux + tiled_index(k, j, i, 3)); rsd3_v += tx3 * c3 * c4 * ( fluxp3_v - flux3_v ) + dx4 * tx1 * ( um3_v - 2.0e+00 * u3_v + up3_v ); vstore(rsd3_v, 0, rsd + tiled_index(k, j, i, 3)); vdouble rsd4_v = vload(0, rsd + tiled_index(k, j, i, 4)); vdouble um4_v = vload(0, u + tiled_index(k, j, i-1, 4)); vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); vdouble up4_v = vload(0, u + tiled_index(k, j, i+1, 4)); vdouble fluxp4_v = vload(0, flux + tiled_index(k, j, i+1, 4)); vdouble flux4_v = vload(0, flux + tiled_index(k, j, i, 4)); rsd4_v += tx3 * c3 * c4 * ( fluxp4_v - flux4_v ) + dx5 * tx1 * ( um4_v - 2.0e+00 * u4_v + up4_v ); vstore(rsd4_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector2/rhs/xi/.svn/text-base/rhs_xi3.cl.svn-base0000444000175600017620000000754611545060262021713 
0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21i = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31i = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41i = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51i = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j, i-1, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j, i-1, 1)); const vdouble u21im1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j, i-1, 2)); const vdouble u31im1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j, i-1, 3)); const vdouble u41im1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j, i-1, 4)); const vdouble u51im1 = tmp * u4m_v; const vdouble flux1_v = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); const vdouble flux2_v = tx3 * ( u31i - u31im1 ); const vdouble flux3_v = tx3 * ( u41i - u41im1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 
) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); // Write out the flux vector. vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } for (; i <= L2; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21i = tmp * u[tiled_index(k, j, i, 1)]; const double u31i = tmp * u[tiled_index(k, j, i, 2)]; const double u41i = tmp * u[tiled_index(k, j, i, 3)]; const double u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; const double u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; const double u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; const double u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; const double u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector2/rhs/xi/.svn/text-base/rhs_xi2.cl.svn-base0000444000175600017620000000530411545060262021700 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fip_v, fim_v; const vdouble tx2_v = (vdouble) tx2; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 0)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 0)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 1)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 1)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 2)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 2)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 3)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 3)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 4)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 4)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } if (iid == 0) { for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( 
flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } } kernels/vector2/rhs/xi/.svn/text-base/rhs_xi1.cl.svn-base0000444000175600017620000000572311545060262021704 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = L1 + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u21 = u1_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u1_v; vdouble flux1_v = u1_v * u21 + c2 * ( u4_v - q ); vdouble flux2_v = u2_v * u21; vdouble flux3_v = u3_v * u21; vdouble flux4_v = (c1 * u4_v - c2 * q) * u21; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } if (iid == 0) { for (; i <= L2; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flux[t_index + 0 * t_offset] = u[t_index + 1 * t_offset]; const double u21 = u[t_index + 1 * t_offset] / u[t_index + 0 * t_offset]; const double q = 0.50e+00 * ( u[t_index + 1 * t_offset] * u[t_index + 1 * t_offset] + u[t_index + 2 * t_offset] * u[t_index + 2 * t_offset] + u[t_index + 3 * t_offset] * u[t_index + 3 * t_offset] ) / u[t_index + 0 * t_offset]; flux[t_index + 1 * t_offset] = u[t_index + 1 * t_offset] * u21 + c2 * ( u[t_index + 4 * t_offset] - q ); flux[t_index + 2 * t_offset] = u[t_index + 2 * t_offset] * u21; flux[t_index + 3 * t_offset] = u[t_index + 3 * t_offset] * u21; flux[t_index + 4 * t_offset] = ( c1 * u[t_index + 4 * t_offset] - c2 * q ) * u21; } } } } } kernels/vector2/rhs/eta/.svn/text-base/rhs_eta_dissipation.cl.svn-base0000444000175600017620000000527411542404556024521 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. 
if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector2/rhs/eta/.svn/text-base/rhs_eta4.cl.svn-base0000444000175600017620000000441311542404557022172 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector2/rhs/eta/.svn/text-base/rhs_eta3.cl.svn-base0000444000175600017620000000762411545060261022171 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21j = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31j = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41j = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51j = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j-1, i, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j-1, i, 1)); const vdouble u21jm1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j-1, i, 2)); const vdouble u31jm1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j-1, i, 3)); const vdouble u41jm1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j-1, i, 4)); const vdouble u51jm1 = tmp * u4m_v; const vdouble flux1_v = ty3 * ( u21j - u21jm1 ); const vdouble flux2_v = (4.0e+00/3.0e+00) * ty3 * ( u31j - u31jm1 ); const vdouble flux3_v = ty3 * ( u41j - u41jm1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 
* ty3 * ( u51j - u51jm1 ); // Write out the flux vector. vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } if (iid == 0) { for (; i <= iend; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21j = tmp * u[tiled_index(k, j, i, 1)]; const double u31j = tmp * u[tiled_index(k, j, i, 2)]; const double u41j = tmp * u[tiled_index(k, j, i, 3)]; const double u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; const double u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; const double u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; const double u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; const double u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } } kernels/vector2/rhs/eta/.svn/text-base/rhs_eta2.cl.svn-base0000444000175600017620000000521511545060262022163 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fjp_v, fjm_v; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 0)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 0)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 1)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 1)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 2)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 2)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 3)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 3)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 4)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 4)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } if (iid == 0) { for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 
2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } } kernels/vector2/rhs/eta/.svn/text-base/rhs_eta1.cl.svn-base0000444000175600017620000000553011545060262022162 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u31 = u2_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u2_v; vdouble flux1_v = u1_v * u31; vdouble flux2_v = u2_v * u31 + c2 * (u4_v - q); vdouble flux3_v = u3_v * u31; vdouble flux4_v = (c1 * u4_v - c2 * q) * u31; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } if (iid == 0) { for (; i <= iend; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in the u values. double u0 = u[t_index + 0 * t_offset]; double u1 = u[t_index + 1 * t_offset]; double u2 = u[t_index + 2 * t_offset]; double u3 = u[t_index + 3 * t_offset]; double u4 = u[t_index + 4 * t_offset]; // Update flux. flux[t_index + 0 * t_offset] = u2; const double u31 = u2 / u0; const double q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[t_index + 1 * t_offset] = u1 * u31; flux[t_index + 2 * t_offset] = u2 * u31 + c2 * ( u4 - q ); flux[t_index + 3 * t_offset] = u3 * u31; flux[t_index + 4 * t_offset] = ( c1 * u4 - c2 * q ) * u31; } } } } } kernels/vector/bak/rhs/eta/.svn/entries0000444000175600017620000000167311551607761016606 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/eta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_eta1.cl file 1179 2011-03-29T14:33:20.000000Z 0babb080e757c75c7cd3472a0e70166f 2011-03-31T11:08:35.335679Z 1179 sjp 2904 rhs_eta2.cl file 1179 2011-03-29T14:48:38.000000Z 6b3477d606969baff7d497ee17c2b30f 2011-03-31T11:08:35.335679Z 1179 sjp 2701 rhs_eta3.cl file 1179 2011-03-29T14:53:53.000000Z 72161d0362d178d2e99261ca913243a1 2011-03-31T11:08:35.335679Z 1179 sjp 3988 rhs_eta4.cl file 2011-03-28T15:22:00.000000Z 2b8d2fc469624b5ebbcea4546ecac5c7 2011-03-23T14:53:37.138628Z 1172 sjp 2315 rhs_eta_dissipation.cl file 2011-03-28T15:22:00.000000Z eb50830bbdd92d5e408772c89a14a9fa 2011-03-23T14:53:37.138628Z 1172 sjp 2748 kernels/vector/bak/rhs/.svn/text-base/rhs_setup.cl.svn-base0000444000175600017620000000353211545060262022372 
0ustar sjpsjp// OpenCL for updating rsd based on frct. __kernel void rhs_setup_kernel( __global double* rsd, __global const double* frct) { // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = 2 + jid; j <= ny + 1; j += jsize) { const int ibound = ((nx + 1)/vlength)*vlength; const int iinc = (isize * vlength); for (i = 2 + (iid*vlength); i <= ibound; i+= iinc) { vdouble rsd_v, frct_v; const int t_index = tiled_index(k, j, i, 0); const int t_offset = (isiz1 + 4) * (isiz2 + 4) * isiz3; frct_v = vload(0, frct + t_index + 0 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 0 * t_offset); frct_v = vload(0, frct + t_index + 1 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 1 * t_offset); frct_v = vload(0, frct + t_index + 2 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 2 * t_offset); frct_v = vload(0, frct + t_index + 3 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 3 * t_offset); frct_v = vload(0, frct + t_index + 4 * t_offset); rsd_v = -frct_v; vstore(rsd_v, 0, rsd + t_index + 4 * t_offset); } for (; i <= nx + 1; i += isize) { rsd[tiled_index(k, j, i, 0)] = -frct[tiled_index(k, j, i, 0)]; rsd[tiled_index(k, j, i, 1)] = -frct[tiled_index(k, j, i, 1)]; rsd[tiled_index(k, j, i, 2)] = -frct[tiled_index(k, j, i, 2)]; rsd[tiled_index(k, j, i, 3)] = -frct[tiled_index(k, j, i, 3)]; rsd[tiled_index(k, j, i, 4)] = -frct[tiled_index(k, j, i, 4)]; } } } } kernels/vector/bak/rhs/zeta/.svn/entries0000444000175600017620000000166511551607760017000 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/zeta svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_zeta3.cl file 
2011-03-28T15:22:04.000000Z bbfbdd4abe0adb0dda1090ba53692ecf 2011-03-23T14:53:37.138628Z 1172 sjp 1802 rhs_zeta4.cl file 2011-03-28T15:22:04.000000Z c26ce76711d57332549e844692f82013 2011-03-23T14:53:37.138628Z 1172 sjp 2294 rhs_zeta_dissipation.cl file 2011-03-28T15:22:04.000000Z 1b9c5d058fa42b269e992154b4ea0599 2011-03-23T14:53:37.138628Z 1172 sjp 2484 rhs_zeta1.cl file 2011-03-28T15:22:04.000000Z 0c8ea304c9f9553c6dba3d1509915414 2011-03-23T14:53:37.138628Z 1172 sjp 1471 rhs_zeta2.cl file 2011-03-28T15:22:04.000000Z 15a7441d85c332a77f0ab7bafff215a2 2011-03-23T14:53:37.138628Z 1172 sjp 1405 kernels/vector/bak/rhs/xi/.svn/entries0000444000175600017620000000167111551607760016452 0ustar sjpsjp10 dir 1178 svn://svn/perfmodelling/trunk/gpu/opencl/lu/kernels/vector/rhs/xi svn://svn/perfmodelling 2011-03-23T14:53:37.138628Z 1172 sjp e58d8ee3-8805-4e15-a2a8-cc27420ae2e2 rhs_xi_dissipation.cl file 2011-03-28T15:21:59.000000Z 15ea5b4435801f138ab5d4274e7f5d5b 2011-03-23T14:53:37.138628Z 1172 sjp 2511 rhs_xi1.cl file 1179 2011-03-29T14:26:15.000000Z 0bc23ab6d9282ba9c194b1f217a5a31f 2011-03-31T11:08:35.335679Z 1179 sjp 3027 rhs_xi2.cl file 1179 2011-03-29T14:26:22.000000Z cab6f80a4b5bc8a4e77ccdcb10a780b9 2011-03-31T11:08:35.335679Z 1179 sjp 2756 rhs_xi3.cl file 1179 2011-03-29T14:27:19.000000Z e079d32b796497343d253725bd964d65 2011-03-31T11:08:35.335679Z 1179 sjp 3942 rhs_xi4.cl file 1179 2011-03-29T14:27:26.000000Z 58e6770b3cb4b5d0fb91ff1c0808d4dc 2011-03-31T11:08:35.335679Z 1179 sjp 5205 kernels/vector/rhs/xi/.svn/text-base/rhs_xi_dissipation.cl.svn-base0000444000175600017620000000471711553620275024156 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector/rhs/xi/.svn/text-base/rhs_xi4.cl.svn-base0000444000175600017620000000420511553620401021613 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. 
*/ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector/rhs/xi/.svn/text-base/rhs_xi3.cl.svn-base0000444000175600017620000000362211553620400021613 0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector/rhs/xi/.svn/text-base/rhs_xi2.cl.svn-base0000444000175600017620000000223511553620400021611 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/vector/rhs/xi/.svn/text-base/rhs_xi1.cl.svn-base0000444000175600017620000000317211553620400021611 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u21; int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; } } } } kernels/vector/rhs/zeta/.svn/text-base/rhs_zeta_dissipation.cl.svn-base0000444000175600017620000000466411553620277025027 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/rhs/zeta/.svn/text-base/rhs_zeta4.cl.svn-base0000444000175600017620000000436611553620276022503 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector/rhs/zeta/.svn/text-base/rhs_zeta3.cl.svn-base0000444000175600017620000000341211553620276022471 0ustar sjpsjp/** * Third part of zeta-direction flux differences. 
* Update flux based on u. */ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector/rhs/zeta/.svn/text-base/rhs_zeta2.cl.svn-base0000444000175600017620000000257511553620277022502 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector/rhs/zeta/.svn/text-base/rhs_zeta1.cl.svn-base0000444000175600017620000000267711553620277022504 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector/rhs/eta/.svn/text-base/rhs_eta_dissipation.cl.svn-base0000444000175600017620000000527411553620277024441 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. 
if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector/rhs/eta/.svn/text-base/rhs_eta4.cl.svn-base0000444000175600017620000000441311553620277022111 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector/rhs/eta/.svn/text-base/rhs_eta3.cl.svn-base0000444000175600017620000000356611553620401022105 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/vector/rhs/eta/.svn/text-base/rhs_eta2.cl.svn-base0000444000175600017620000000224611553620401022076 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/vector/rhs/eta/.svn/text-base/rhs_eta1.cl.svn-base0000444000175600017620000000277611553620400022104 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u31; int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Read in the u values. 
double u0 = u[tiled_index(k, j, i, 0)]; double u1 = u[tiled_index(k, j, i, 1)]; double u2 = u[tiled_index(k, j, i, 2)]; double u3 = u[tiled_index(k, j, i, 3)]; double u4 = u[tiled_index(k, j, i, 4)]; // Update flux. flux[tiled_index(k, j, i, 0)] = u2; u31 = u2 / u0; q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[tiled_index(k, j, i, 1)] = u1 * u31; flux[tiled_index(k, j, i, 2)] = u2 * u31 + c2 * ( u4 - q ); flux[tiled_index(k, j, i, 3)] = u3 * u31; flux[tiled_index(k, j, i, 4)] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/scalar/rhs/zeta/.svn/text-base/rhs_zeta_dissipation.cl.svn-base0000444000175600017620000000466411542404556024770 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 
* u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/scalar/rhs/zeta/.svn/text-base/rhs_zeta4.cl.svn-base0000444000175600017620000000436611542404556022445 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/scalar/rhs/zeta/.svn/text-base/rhs_zeta3.cl.svn-base0000444000175600017620000000341211542404557022434 0ustar sjpsjp/** * Third part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/scalar/rhs/zeta/.svn/text-base/rhs_zeta2.cl.svn-base0000444000175600017620000000257511542404560022436 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/scalar/rhs/zeta/.svn/text-base/rhs_zeta1.cl.svn-base0000444000175600017620000000267711542404556022445 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/scalar/rhs/xi/.svn/text-base/rhs_xi_dissipation.cl.svn-base0000444000175600017620000000471711542404560024114 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/scalar/rhs/xi/.svn/text-base/rhs_xi4.cl.svn-base0000444000175600017620000000420511542404556021567 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/scalar/rhs/xi/.svn/text-base/rhs_xi3.cl.svn-base0000444000175600017620000000362211542404557021571 0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ int L2; double u21i, u31i, u41i, u51i; double u21im1, u31im1, u41im1, u51im1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. 
if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= L2; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21i = tmp * u[tiled_index(k, j, i, 1)]; u31i = tmp * u[tiled_index(k, j, i, 2)]; u41i = tmp * u[tiled_index(k, j, i, 3)]; u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/scalar/rhs/xi/.svn/text-base/rhs_xi2.cl.svn-base0000444000175600017620000000223511542404560021561 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } kernels/scalar/rhs/xi/.svn/text-base/rhs_xi1.cl.svn-base0000444000175600017620000000317211542404561021562 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u21; int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = L1 + iid; i <= L2; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 1)]; u21 = u[tiled_index(k, j, i, 1)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u21 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u21; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u21; flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u21; } } } } kernels/scalar/rhs/eta/.svn/text-base/rhs_eta_dissipation.cl.svn-base0000444000175600017620000000527411542404557024403 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. 
if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/scalar/rhs/eta/.svn/text-base/rhs_eta4.cl.svn-base0000444000175600017620000000441311542404560022045 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/scalar/rhs/eta/.svn/text-base/rhs_eta3.cl.svn-base0000444000175600017620000000356611542404560022054 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; double u21j, u31j, u41j, u51j; double u21jm1, u31jm1, u41jm1, u51jm1; double tmp; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21j = tmp * u[tiled_index(k, j, i, 1)]; u31j = tmp * u[tiled_index(k, j, i, 2)]; u41j = tmp * u[tiled_index(k, j, i, 3)]; u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } kernels/scalar/rhs/eta/.svn/text-base/rhs_eta2.cl.svn-base0000444000175600017620000000224611542404556022052 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } kernels/scalar/rhs/eta/.svn/text-base/rhs_eta1.cl.svn-base0000444000175600017620000000277611542404557022062 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u31; int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Read in the u values. 
double u0 = u[tiled_index(k, j, i, 0)]; double u1 = u[tiled_index(k, j, i, 1)]; double u2 = u[tiled_index(k, j, i, 2)]; double u3 = u[tiled_index(k, j, i, 3)]; double u4 = u[tiled_index(k, j, i, 4)]; // Update flux. flux[tiled_index(k, j, i, 0)] = u2; u31 = u2 / u0; q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[tiled_index(k, j, i, 1)] = u1 * u31; flux[tiled_index(k, j, i, 2)] = u2 * u31 + c2 * ( u4 - q ); flux[tiled_index(k, j, i, 3)] = u3 * u31; flux[tiled_index(k, j, i, 4)] = ( c1 * u4 - c2 * q ) * u31; } } } } kernels/vector/bak/rhs/eta/.svn/text-base/rhs_eta_dissipation.cl.svn-base0000444000175600017620000000527411542404556025174 0ustar sjpsjp/** * Fourth-order dissipation in the eta-direction. */ // TODO: Unroll some of these m loops. __kernel void rhs_eta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double jst1, jend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (west == -1) { for (m = 0; m < 5; m++) { if (j == 3) { rsd[tiled_index(k, 3, i, m)] = rsd[tiled_index(k, 3, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(k, 3, i, m)] - 4.0e+00 * u[tiled_index(k, 4, i, m)] + u[tiled_index(k, 5, i, m)] ); } if (j == 4) { rsd[tiled_index(k, 4, i, m)] = rsd[tiled_index(k, 4, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(k, 3, i, m)] + 6.0e+00 * u[tiled_index(k, 4, i, m)] - 4.0e+00 * u[tiled_index(k, 5, i, m)] + u[tiled_index(k, 6, i, m)] ); } } } // Update jst1 and jend1 based on east and west. 
if (west != -1) { jst1 = 2; } if (east != -1) { jend1 = ny + 1; } if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } // If j is in range, update rsd. if (j >= jst1 && j <= jend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k, j-2, i, m)] - 4.0e+00 * u[tiled_index(k, j-1, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j+1, i, m)] + u[tiled_index(k, j+2, i, m)] ); } } if (east == -1) { for (m = 0; m < 5; m++) { if (j == ny - 1) { rsd[tiled_index(k, ny-1, i, m)] = rsd[tiled_index(k, ny-1, i, m)] - dssp * ( u[tiled_index(k, ny-3, i, m)] - 4.0e+00 * u[tiled_index(k, ny-2, i, m)] + 6.0e+00 * u[tiled_index(k, ny-1, i, m)] - 4.0e+00 * u[tiled_index(k, ny, i, m)] ); } if (j == ny) { rsd[tiled_index(k, ny, i, m)] = rsd[tiled_index(k, ny, i, m)] - dssp * ( u[tiled_index(k, ny-2, i, m)] - 4.0e+00 * u[tiled_index(k, ny-1, i, m)] + 5.0e+00 * u[tiled_index(k, ny, i, m)] ); } } } } } } } kernels/vector/bak/rhs/eta/.svn/text-base/rhs_eta4.cl.svn-base0000444000175600017620000000441311542404557022645 0ustar sjpsjp/** * Fourth part of eta-direction flux differences. * Update rsd based on u and flux. */ __kernel void rhs_eta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dy1 * ty1 * ( u[tiled_index(k, j-1, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j+1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + ty3 * c3 * c4 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dy2 * ty1 * ( u[tiled_index(k, j-1, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j+1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dy3 * ty1 * ( u[tiled_index(k, j-1, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j+1, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dy4 * ty1 * ( u[tiled_index(k, j-1, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j+1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + ty3 * c3 * c4 * (flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dy5 * ty1 * ( u[tiled_index(k, j-1, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j+1, i, 4)] ); } } } } kernels/vector/bak/rhs/eta/.svn/text-base/rhs_eta3.cl.svn-base0000444000175600017620000000762411545060261022644 0ustar sjpsjp/** * Third part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. 
*/ int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on east. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21j = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31j = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41j = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51j = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j-1, i, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j-1, i, 1)); const vdouble u21jm1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j-1, i, 2)); const vdouble u31jm1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j-1, i, 3)); const vdouble u41jm1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j-1, i, 4)); const vdouble u51jm1 = tmp * u4m_v; const vdouble flux1_v = ty3 * ( u21j - u21jm1 ); const vdouble flux2_v = (4.0e+00/3.0e+00) * ty3 * ( u31j - u31jm1 ); const vdouble flux3_v = ty3 * ( u41j - u41jm1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 
* ty3 * ( u51j - u51jm1 ); // Write out the flux vector. vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } if (iid == 0) { for (; i <= iend; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21j = tmp * u[tiled_index(k, j, i, 1)]; const double u31j = tmp * u[tiled_index(k, j, i, 2)]; const double u41j = tmp * u[tiled_index(k, j, i, 3)]; const double u51j = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j-1, i, 0)]; const double u21jm1 = tmp * u[tiled_index(k, j-1, i, 1)]; const double u31jm1 = tmp * u[tiled_index(k, j-1, i, 2)]; const double u41jm1 = tmp * u[tiled_index(k, j-1, i, 3)]; const double u51jm1 = tmp * u[tiled_index(k, j-1, i, 4)]; flux[tiled_index(k, j, i, 1)] = ty3 * ( u21j - u21jm1 ); flux[tiled_index(k, j, i, 2)] = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux[tiled_index(k, j, i, 3)] = ty3 * ( u41j - u41jm1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } } } } } kernels/vector/bak/rhs/eta/.svn/text-base/rhs_eta2.cl.svn-base0000444000175600017620000000521511545060262022636 0ustar sjpsjp/** * Second part of eta-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_eta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fjp_v, fjm_v; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 0)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 0)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 1)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 1)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 2)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 2)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 3)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 3)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fjp_v = vload(0, flux + tiled_index(k, j+1, i, 4)); fjm_v = vload(0, flux + tiled_index(k, j-1, i, 4)); rsd_v -= ty2 * (fjp_v - fjm_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } if (iid == 0) { for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= ty2 * ( flux[tiled_index(k, j+1, i, 0)] - flux[tiled_index(k, j-1, i, 0)] ); rsd[tiled_index(k, j, i, 1)] -= ty2 * ( flux[tiled_index(k, j+1, i, 1)] - flux[tiled_index(k, j-1, i, 1)] ); rsd[tiled_index(k, j, i, 2)] -= ty2 * ( flux[tiled_index(k, j+1, i, 2)] - flux[tiled_index(k, j-1, i, 
2)] ); rsd[tiled_index(k, j, i, 3)] -= ty2 * ( flux[tiled_index(k, j+1, i, 3)] - flux[tiled_index(k, j-1, i, 3)] ); rsd[tiled_index(k, j, i, 4)] -= ty2 * ( flux[tiled_index(k, j+1, i, 4)] - flux[tiled_index(k, j-1, i, 4)] ); } } } } } kernels/vector/bak/rhs/eta/.svn/text-base/rhs_eta1.cl.svn-base0000444000175600017620000000553011545060262022635 0ustar sjpsjp/** * First part of eta-direction flux differences. * Update flux based on u. */ __kernel void rhs_eta1_kernel( __global const double* u, __global double* flux) { int L1, L2; // Set L1. if (west != -1) { L1 = 1; } if (west == -1) { L1 = 2; } // Set L2. if (east != -1) { L2 = ny + 2; } if (east == -1) { L2 = ny + 1; } const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = L1 + jid; j <= L2; j += jsize) { const int ibound = (iend/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u31 = u2_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u2_v; vdouble flux1_v = u1_v * u31; vdouble flux2_v = u2_v * u31 + c2 * (u4_v - q); vdouble flux3_v = u3_v * u31; vdouble flux4_v = (c1 * u4_v - c2 * q) * u31; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } if (iid == 0) { for (; i <= iend; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in the u values. double u0 = u[t_index + 0 * t_offset]; double u1 = u[t_index + 1 * t_offset]; double u2 = u[t_index + 2 * t_offset]; double u3 = u[t_index + 3 * t_offset]; double u4 = u[t_index + 4 * t_offset]; // Update flux. flux[t_index + 0 * t_offset] = u2; const double u31 = u2 / u0; const double q = 0.50e+00 * ( u1 * u1 + u2 * u2 + u3 * u3 ) / u0; flux[t_index + 1 * t_offset] = u1 * u31; flux[t_index + 2 * t_offset] = u2 * u31 + c2 * ( u4 - q ); flux[t_index + 3 * t_offset] = u3 * u31; flux[t_index + 4 * t_offset] = ( c1 * u4 - c2 * q ) * u31; } } } } } kernels/vector/bak/rhs/zeta/.svn/text-base/rhs_zeta_dissipation.cl.svn-base0000444000175600017620000000466411542404557025563 0ustar sjpsjp/** * Fourth-order dissipation in the zeta direction. */ __kernel void rhs_zeta_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { if (k == 1) { for (m = 0; m < 5; m++) { rsd[tiled_index(1, j, i, m)] = rsd[tiled_index(1, j, i, m)] - dssp * ( + 5.0e+00 * u[tiled_index(1, j, i, m)] - 4.0e+00 * u[tiled_index(2, j, i, m)] + u[tiled_index(3, j, i, m)] ); } } else if (k == 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(2, j, i, m)] = rsd[tiled_index(2, j, i, m)] - dssp * ( - 4.0e+00 * u[tiled_index(1, j, i, m)] + 6.0e+00 * u[tiled_index(2, j, i, m)] - 4.0e+00 * u[tiled_index(3, j, i, m)] + u[tiled_index(4, j, i, m)] ); } } else if (k >= 3 && k <= nz - 4) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] = rsd[tiled_index(k, j, i, m)] - dssp * ( u[tiled_index(k-2, j, i, m)] - 4.0e+00 * u[tiled_index(k-1, j, i, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k+1, j, i, m)] + u[tiled_index(k+2, j, i, m)] ); } } else if (k == nz - 3) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-3, j, i, m)] = rsd[tiled_index(nz-3, j, i, m)] - dssp * ( u[tiled_index(nz-5, j, i, m)] - 4.0e+00 * u[tiled_index(nz-4, j, i, m)] + 6.0e+00 * u[tiled_index(nz-3, j, i, m)] - 4.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } else if (k == nz - 2) { for (m = 0; m < 5; m++) { rsd[tiled_index(nz-2, j, i, m)] = rsd[tiled_index(nz-2, j, i, m)] - dssp * ( u[tiled_index(nz-4, j, i, m)] - 4.0e+00 * u[tiled_index(nz-3, j, i, m)] + 5.0e+00 * u[tiled_index(nz-2, j, i, m)] ); } } } } } } kernels/vector/bak/rhs/zeta/.svn/text-base/rhs_zeta4.cl.svn-base0000444000175600017620000000436611542404560023232 0ustar sjpsjp/** * Fourth part of zeta-direction flux differences. * Update rsd based on u and flux. 
*/ __kernel void rhs_zeta4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] + dz1 * tz1 * ( u[tiled_index(k-1, j, i, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k+1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k, j, i, 1)] ) + dz2 * tz1 * ( u[tiled_index(k-1, j, i, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k+1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k, j, i, 2)] ) + dz3 * tz1 * ( u[tiled_index(k-1, j, i, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k+1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k, j, i, 3)] ) + dz4 * tz1 * ( u[tiled_index(k-1, j, i, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k+1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] + tz3 * c3 * c4 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k, j, i, 4)] ) + dz5 * tz1 * ( u[tiled_index(k-1, j, i, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k+1, j, i, 4)] ); } } } } kernels/vector/bak/rhs/zeta/.svn/text-base/rhs_zeta3.cl.svn-base0000444000175600017620000000341211542404556023225 0ustar sjpsjp/** * Third part of zeta-direction flux differences. 
* Update flux based on u. */ __kernel void rhs_zeta3_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double tmp; double u21k, u31k, u41k, u51k; double u21km1, u31km1, u41km1, u51km1; const double c1 = c1_def; const double c5 = c5_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; u21k = tmp * u[tiled_index(k, j, i, 1)]; u31k = tmp * u[tiled_index(k, j, i, 2)]; u41k = tmp * u[tiled_index(k, j, i, 3)]; u51k = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k-1, j, i, 0)]; u21km1 = tmp * u[tiled_index(k-1, j, i, 1)]; u31km1 = tmp * u[tiled_index(k-1, j, i, 2)]; u41km1 = tmp * u[tiled_index(k-1, j, i, 3)]; u51km1 = tmp * u[tiled_index(k-1, j, i, 4)]; flux[tiled_index(k, j, i, 1)] = tz3 * ( u21k - u21km1 ); flux[tiled_index(k, j, i, 2)] = tz3 * ( u31k - u31km1 ); flux[tiled_index(k, j, i, 3)] = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } } } } kernels/vector/bak/rhs/zeta/.svn/text-base/rhs_zeta2.cl.svn-base0000444000175600017620000000257511542404556023235 0ustar sjpsjp/** * Second part of zeta-direction flux differences. * Update rsd based on u. */ __kernel void rhs_zeta2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] = rsd[tiled_index(k, j, i, 0)] - tz2 * ( flux[tiled_index(k+1, j, i, 0)] - flux[tiled_index(k-1, j, i, 0)] ); rsd[tiled_index(k, j, i, 1)] = rsd[tiled_index(k, j, i, 1)] - tz2 * ( flux[tiled_index(k+1, j, i, 1)] - flux[tiled_index(k-1, j, i, 1)] ); rsd[tiled_index(k, j, i, 2)] = rsd[tiled_index(k, j, i, 2)] - tz2 * ( flux[tiled_index(k+1, j, i, 2)] - flux[tiled_index(k-1, j, i, 2)] ); rsd[tiled_index(k, j, i, 3)] = rsd[tiled_index(k, j, i, 3)] - tz2 * ( flux[tiled_index(k+1, j, i, 3)] - flux[tiled_index(k-1, j, i, 3)] ); rsd[tiled_index(k, j, i, 4)] = rsd[tiled_index(k, j, i, 4)] - tz2 * ( flux[tiled_index(k+1, j, i, 4)] - flux[tiled_index(k-1, j, i, 4)] ); } } } } kernels/vector/bak/rhs/zeta/.svn/text-base/rhs_zeta1.cl.svn-base0000444000175600017620000000267711542404557023240 0ustar sjpsjp/** * First part of zeta-direction flux differences. * Update flux based on u. */ __kernel void rhs_zeta1_kernel( __global const double* u, __global double* flux) { /** * Local variables. */ double q, u41; const double c1 = c1_def; const double c2 = c2_def; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 0 + kid; k <= nz - 1; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = ist + iid; i <= iend; i += isize) { // Update flux. 
flux[tiled_index(k, j, i, 0)] = u[tiled_index(k, j, i, 3)]; u41 = u[tiled_index(k, j, i, 3)] / u[tiled_index(k, j, i, 0)]; q = 0.50e+00 * ( u[tiled_index(k, j, i, 1)] * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i, 2)] * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i, 3)] * u[tiled_index(k, j, i, 3)] ) / u[tiled_index(k, j, i, 0)]; flux[tiled_index(k, j, i, 1)] = u[tiled_index(k, j, i, 1)] * u41; flux[tiled_index(k, j, i, 2)] = u[tiled_index(k, j, i, 2)] * u41; flux[tiled_index(k, j, i, 3)] = u[tiled_index(k, j, i, 3)] * u41 + c2 * ( u[tiled_index(k, j, i, 4)] - q ); flux[tiled_index(k, j, i, 4)] = ( c1 * u[tiled_index(k, j, i, 4)] - c2 * q ) * u41; } } } } kernels/vector/bak/rhs/xi/.svn/text-base/rhs_xi_dissipation.cl.svn-base0000444000175600017620000000471711542404560024706 0ustar sjpsjp/** * Fourth-order dissipation step in xi-direction. */ __kernel void rhs_xi_dissipation_kernel( __global const double* u, __global double* rsd) { /** * Local variables. */ int m; double ist1, iend1; // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { for (i = 0 + iid; i <= isiz1 + 4; i += isize) { if (north == -1) { for (m = 0; m < 5; m++) { if (i == 3) { rsd[tiled_index(k, j, 3, m)] -= dssp * ( + 5.0e+00 * u[tiled_index(k, j, 3, m)] - 4.0e+00 * u[tiled_index(k, j, 4, m)] + u[tiled_index(k, j, 5, m)] ); } if (i == 4) { rsd[tiled_index(k, j, 4, m)] -= dssp * ( - 4.0e+00 * u[tiled_index(k, j, 3, m)] + 6.0e+00 * u[tiled_index(k, j, 4, m)] - 4.0e+00 * u[tiled_index(k, j, 5, m)] + u[tiled_index(k, j, 6, m)] ); } } } // Update ist1 and iend1 based on north and south. 
if (north != -1) { ist1 = 2; } if (south != -1) { iend1 = nx + 1; } if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } // If i is in range, update rsd. if (i >= ist1 && i <= iend1) { for (m = 0; m < 5; m++) { rsd[tiled_index(k, j, i, m)] -= dssp * ( u[tiled_index(k, j, i-2, m)] - 4.0e+00 * u[tiled_index(k, j, i-1, m)] + 6.0e+00 * u[tiled_index(k, j, i, m)] - 4.0e+00 * u[tiled_index(k, j, i+1, m)] + u[tiled_index(k, j, i+2, m)] ); } } if (south == -1) { for (m = 0; m < 5; m++) { if (i == nx - 1) { rsd[tiled_index(k, j, nx-1, m)] -= dssp * ( u[tiled_index(k, j, nx-3, m)] - 4.0e+00 * u[tiled_index(k, j, nx-2, m)] + 6.0e+00 * u[tiled_index(k, j, nx-1, m)] - 4.0e+00 * u[tiled_index(k, j, nx, m)] ); } if (i == nx) { rsd[tiled_index(k, j, nx, m)] -= dssp * ( u[tiled_index(k, j, nx-2, m)] - 4.0e+00 * u[tiled_index(k, j, nx-1, m)] + 5.0e+00 * u[tiled_index(k, j, nx, m)] ); } } } } } } } kernels/vector/bak/rhs/xi/.svn/text-base/rhs_xi4.cl.svn-base0000444000175600017620000001212511545060262022354 0ustar sjpsjp/** * Fourth part of xi-direction flux differences. * Update rsd based on u. */ __kernel void rhs_xi4_kernel( __global const double* u, __global double* rsd, __global const double* flux) { // Local variables. const double c3 = c3_def; const double c4 = c4_def; // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd0_v = vload(0, rsd + tiled_index(k, j, i, 0)); vdouble um0_v = vload(0, u + tiled_index(k, j, i-1, 0)); vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble up0_v = vload(0, u + tiled_index(k, j, i+1, 0)); rsd0_v += dx1 * tx1 * ( um0_v - 2.0e+00 * u0_v + up0_v ); vstore(rsd0_v, 0, rsd + tiled_index(k, j, i, 0)); vdouble rsd1_v = vload(0, rsd + tiled_index(k, j, i, 1)); vdouble um1_v = vload(0, u + tiled_index(k, j, i-1, 1)); vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); vdouble up1_v = vload(0, u + tiled_index(k, j, i+1, 1)); vdouble fluxp1_v = vload(0, flux + tiled_index(k, j, i+1, 1)); vdouble flux1_v = vload(0, flux + tiled_index(k, j, i, 1)); rsd1_v += tx3 * c3 * c4 * ( fluxp1_v - flux1_v ) + dx2 * tx1 * ( um1_v - 2.0e+00 * u1_v + up1_v ); vstore(rsd1_v, 0, rsd + tiled_index(k, j, i, 1)); vdouble rsd2_v = vload(0, rsd + tiled_index(k, j, i, 2)); vdouble um2_v = vload(0, u + tiled_index(k, j, i-1, 2)); vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); vdouble up2_v = vload(0, u + tiled_index(k, j, i+1, 2)); vdouble fluxp2_v = vload(0, flux + tiled_index(k, j, i+1, 2)); vdouble flux2_v = vload(0, flux + tiled_index(k, j, i, 2)); rsd2_v += tx3 * c3 * c4 * ( fluxp2_v - flux2_v ) + dx3 * tx1 * ( um2_v - 2.0e+00 * u2_v + up2_v ); vstore(rsd2_v, 0, rsd + tiled_index(k, j, i, 2)); vdouble rsd3_v = vload(0, rsd + tiled_index(k, j, i, 3)); vdouble um3_v = vload(0, u + tiled_index(k, j, i-1, 3)); vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); vdouble up3_v = vload(0, u + tiled_index(k, j, i+1, 3)); vdouble 
fluxp3_v = vload(0, flux + tiled_index(k, j, i+1, 3)); vdouble flux3_v = vload(0, flux + tiled_index(k, j, i, 3)); rsd3_v += tx3 * c3 * c4 * ( fluxp3_v - flux3_v ) + dx4 * tx1 * ( um3_v - 2.0e+00 * u3_v + up3_v ); vstore(rsd3_v, 0, rsd + tiled_index(k, j, i, 3)); vdouble rsd4_v = vload(0, rsd + tiled_index(k, j, i, 4)); vdouble um4_v = vload(0, u + tiled_index(k, j, i-1, 4)); vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); vdouble up4_v = vload(0, u + tiled_index(k, j, i+1, 4)); vdouble fluxp4_v = vload(0, flux + tiled_index(k, j, i+1, 4)); vdouble flux4_v = vload(0, flux + tiled_index(k, j, i, 4)); rsd4_v += tx3 * c3 * c4 * ( fluxp4_v - flux4_v ) + dx5 * tx1 * ( um4_v - 2.0e+00 * u4_v + up4_v ); vstore(rsd4_v, 0, rsd + tiled_index(k, j, i, 4)); } for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] += dx1 * tx1 * ( u[tiled_index(k, j, i-1, 0)] - 2.0e+00 * u[tiled_index(k, j, i, 0)] + u[tiled_index(k, j, i+1, 0)] ); rsd[tiled_index(k, j, i, 1)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i, 1)] ) + dx2 * tx1 * ( u[tiled_index(k, j, i-1, 1)] - 2.0e+00 * u[tiled_index(k, j, i, 1)] + u[tiled_index(k, j, i+1, 1)] ); rsd[tiled_index(k, j, i, 2)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i, 2)] ) + dx3 * tx1 * ( u[tiled_index(k, j, i-1, 2)] - 2.0e+00 * u[tiled_index(k, j, i, 2)] + u[tiled_index(k, j, i+1, 2)] ); rsd[tiled_index(k, j, i, 3)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i, 3)] ) + dx4 * tx1 * ( u[tiled_index(k, j, i-1, 3)] - 2.0e+00 * u[tiled_index(k, j, i, 3)] + u[tiled_index(k, j, i+1, 3)] ); rsd[tiled_index(k, j, i, 4)] += tx3 * c3 * c4 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i, 4)] ) + dx5 * tx1 * ( u[tiled_index(k, j, i-1, 4)] - 2.0e+00 * u[tiled_index(k, j, i, 4)] + u[tiled_index(k, j, i+1, 4)] ); } } } } kernels/vector/bak/rhs/xi/.svn/text-base/rhs_xi3.cl.svn-base0000444000175600017620000000754611545060262022366 
0ustar sjpsjp/** * The third part of xi-direction flux differences. * Update flux (again) based on u. */ __kernel void rhs_xi3_kernel( __global const double* u, __global double* flux) { int L2; const double c1 = c1_def; const double c5 = c5_def; // Update L2 based on south. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const vdouble u0_v = vload(0, u + tiled_index(k, j, i, 0)); vdouble tmp = (vdouble) 1.0e+00 / u0_v; const vdouble u1_v = vload(0, u + tiled_index(k, j, i, 1)); const vdouble u21i = tmp * u1_v; const vdouble u2_v = vload(0, u + tiled_index(k, j, i, 2)); const vdouble u31i = tmp * u2_v; const vdouble u3_v = vload(0, u + tiled_index(k, j, i, 3)); const vdouble u41i = tmp * u3_v; const vdouble u4_v = vload(0, u + tiled_index(k, j, i, 4)); const vdouble u51i = tmp * u4_v; const vdouble u0m_v = vload(0, u + tiled_index(k, j, i-1, 0)); tmp = (vdouble) 1.0e+00 / u0m_v; const vdouble u1m_v = vload(0, u + tiled_index(k, j, i-1, 1)); const vdouble u21im1 = tmp * u1m_v; const vdouble u2m_v = vload(0, u + tiled_index(k, j, i-1, 2)); const vdouble u31im1 = tmp * u2m_v; const vdouble u3m_v = vload(0, u + tiled_index(k, j, i-1, 3)); const vdouble u41im1 = tmp * u3m_v; const vdouble u4m_v = vload(0, u + tiled_index(k, j, i-1, 4)); const vdouble u51im1 = tmp * u4m_v; const vdouble flux1_v = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); const vdouble flux2_v = tx3 * ( u31i - u31im1 ); const vdouble flux3_v = tx3 * ( u41i - u41im1 ); const vdouble flux4_v = 0.50e+00 * ( 1.0e+00 - c1 * c5 
) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); // Write out the flux vector. vstore(flux1_v, 0, flux + tiled_index(k, j, i, 1)); vstore(flux2_v, 0, flux + tiled_index(k, j, i, 2)); vstore(flux3_v, 0, flux + tiled_index(k, j, i, 3)); vstore(flux4_v, 0, flux + tiled_index(k, j, i, 4)); } for (; i <= L2; i += isize) { double tmp = 1.0e+00 / u[tiled_index(k, j, i, 0)]; const double u21i = tmp * u[tiled_index(k, j, i, 1)]; const double u31i = tmp * u[tiled_index(k, j, i, 2)]; const double u41i = tmp * u[tiled_index(k, j, i, 3)]; const double u51i = tmp * u[tiled_index(k, j, i, 4)]; tmp = 1.0e+00 / u[tiled_index(k, j, i-1, 0)]; const double u21im1 = tmp * u[tiled_index(k, j, i-1, 1)]; const double u31im1 = tmp * u[tiled_index(k, j, i-1, 2)]; const double u41im1 = tmp * u[tiled_index(k, j, i-1, 3)]; const double u51im1 = tmp * u[tiled_index(k, j, i-1, 4)]; flux[tiled_index(k, j, i, 1)] = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux[tiled_index(k, j, i, 2)] = tx3 * ( u31i - u31im1 ); flux[tiled_index(k, j, i, 3)] = tx3 * ( u41i - u41im1 ); flux[tiled_index(k, j, i, 4)] = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } } } } kernels/vector/bak/rhs/xi/.svn/text-base/rhs_xi2.cl.svn-base0000444000175600017620000000530411545060262022353 0ustar sjpsjp/** * Second part of xi-direction flux differences. * Update rsd based on flux. */ __kernel void rhs_xi2_kernel( __global double* rsd, __global const double* flux) { // Calculate i, j and k values for loops. 
const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = ((iend-1)/vlength)*vlength; for (i = ist + (iid*vlength); i <= ibound; i+= (isize*vlength)) { vdouble rsd_v, fip_v, fim_v; const vdouble tx2_v = (vdouble) tx2; rsd_v = vload(0, rsd + tiled_index(k, j, i, 0)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 0)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 0)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 0)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 1)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 1)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 1)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 1)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 2)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 2)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 2)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 2)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 3)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 3)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 3)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 3)); rsd_v = vload(0, rsd + tiled_index(k, j, i, 4)); fip_v = vload(0, flux + tiled_index(k, j, i+1, 4)); fim_v = vload(0, flux + tiled_index(k, j, i-1, 4)); rsd_v -= tx2_v * (fip_v - fim_v); vstore(rsd_v, 0, rsd + tiled_index(k, j, i, 4)); } if (iid == 0) { for (; i <= iend; i += isize) { rsd[tiled_index(k, j, i, 0)] -= tx2 * ( flux[tiled_index(k, j, i+1, 0)] - flux[tiled_index(k, j, i-1, 0)] ); rsd[tiled_index(k, j, i, 1)] -= tx2 * ( flux[tiled_index(k, j, i+1, 1)] - flux[tiled_index(k, j, i-1, 1)] ); rsd[tiled_index(k, j, i, 2)] -= tx2 * ( 
flux[tiled_index(k, j, i+1, 2)] - flux[tiled_index(k, j, i-1, 2)] ); rsd[tiled_index(k, j, i, 3)] -= tx2 * ( flux[tiled_index(k, j, i+1, 3)] - flux[tiled_index(k, j, i-1, 3)] ); rsd[tiled_index(k, j, i, 4)] -= tx2 * ( flux[tiled_index(k, j, i+1, 4)] - flux[tiled_index(k, j, i-1, 4)] ); } } } } } kernels/vector/bak/rhs/xi/.svn/text-base/rhs_xi1.cl.svn-base0000444000175600017620000000572311545060262022357 0ustar sjpsjp/** * First part of xi-direction flux differences. * Update flux based on u. */ __kernel void rhs_xi1_kernel( __global const double* u, __global double* flux) { int L1, L2; const double c1 = c1_def; const double c2 = c2_def; // Set L1. if (north != -1) { L1 = 1; } if (north == -1) { L1 = 2; } // Set L2. if (south != -1) { L2 = nx + 2; } if (south == -1) { L2 = nx + 1; } // Calculate i, j and k values for loops. const int iid = get_global_id(0); const int jid = get_global_id(1); const int kid = get_global_id(2); const int isize = get_global_size(0); const int jsize = get_global_size(1); const int ksize = get_global_size(2); int i, j, k; for (k = 1 + kid; k <= nz - 2; k += ksize) { for (j = jst + jid; j <= jend; j += jsize) { const int ibound = (L2/vlength)*vlength; for (i = L1 + (iid*vlength); i <= ibound; i+= (isize*vlength)) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); // Read in u vectors. vdouble u0_v = vload(0, u + t_index + 0 * t_offset); vdouble u1_v = vload(0, u + t_index + 1 * t_offset); vdouble u2_v = vload(0, u + t_index + 2 * t_offset); vdouble u3_v = vload(0, u + t_index + 3 * t_offset); vdouble u4_v = vload(0, u + t_index + 4 * t_offset); const vdouble u21 = u1_v / u0_v; const vdouble q = 0.50e+00 * ( u1_v * u1_v + u2_v * u2_v + u3_v * u3_v ) / u0_v; vdouble flux0_v = u1_v; vdouble flux1_v = u1_v * u21 + c2 * ( u4_v - q ); vdouble flux2_v = u2_v * u21; vdouble flux3_v = u3_v * u21; vdouble flux4_v = (c1 * u4_v - c2 * q) * u21; // Write out flux vectors. 
vstore(flux0_v, 0, flux + t_index + 0 * t_offset); vstore(flux1_v, 0, flux + t_index + 1 * t_offset); vstore(flux2_v, 0, flux + t_index + 2 * t_offset); vstore(flux3_v, 0, flux + t_index + 3 * t_offset); vstore(flux4_v, 0, flux + t_index + 4 * t_offset); } if (iid == 0) { for (; i <= L2; i += isize) { const int t_index = tiled_index(k, j, i, 0); const int t_offset = isiz3 * (isiz2 + 4) * (isiz1 + 4); flux[t_index + 0 * t_offset] = u[t_index + 1 * t_offset]; const double u21 = u[t_index + 1 * t_offset] / u[t_index + 0 * t_offset]; const double q = 0.50e+00 * ( u[t_index + 1 * t_offset] * u[t_index + 1 * t_offset] + u[t_index + 2 * t_offset] * u[t_index + 2 * t_offset] + u[t_index + 3 * t_offset] * u[t_index + 3 * t_offset] ) / u[t_index + 0 * t_offset]; flux[t_index + 1 * t_offset] = u[t_index + 1 * t_offset] * u21 + c2 * ( u[t_index + 4 * t_offset] - q ); flux[t_index + 2 * t_offset] = u[t_index + 2 * t_offset] * u21; flux[t_index + 3 * t_offset] = u[t_index + 3 * t_offset] * u21; flux[t_index + 4 * t_offset] = ( c1 * u[t_index + 4 * t_offset] - c2 * q ) * u21; } } } } } bcast_inputs.c0000644000175600017620000000107411553632705012114 0ustar sjpsjp#include "applu.h" #include "mpinpb.h" void bcast_inputs() { root = 0; MPI_Bcast(&ipr, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&inorm, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&itmax, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&dt, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&omega, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&tolrsd, 5, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&nx0, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&ny0, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); MPI_Bcast(&nz0, 1, MPI_DOUBLE, root, MPI_COMM_WORLD); } blts.c0000644000175600017620000001253011553632705010361 0ustar sjpsjp// C port of NPB3.2 // subroutine blts #include "size.h" #include "util.h" /** * Compute the regular-sparse, block lower triangular solution. 
* v <-- ( L-inv ) * v */ void blts ( int ldmx, int ldmy, int ldmz, int nx, int ny, int nz, int starting_k, double omega, double**** v, double ldz, double ldy, double ldx, double d, int ist, int iend, int jst, int jend, int nx0, int ny0, int ipt, int jpt) { /** * Local variables. */ int i, j, k, m; int iex; double tmp, tmp1; double tmat[5][5]; // Receive data from north and west. iex = 0; exchange_1(v, starting_k, iex); const int kstart = max(starting_k, 1); const int kend = min(starting_k + kblock, nz - 1); for (k = kstart; k < kend; k++) { int level = k % kblock; for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { v(k,j,i,m) = v(k,j,i,m) - omega * ( ldz(level,j,i,0)[m] * v(k-1,j,i,0) + ldz(level,j,i,1)[m] * v(k-1,j,i,1) + ldz(level,j,i,2)[m] * v(k-1,j,i,2) + ldz(level,j,i,3)[m] * v(k-1,j,i,3) + ldz(level,j,i,4)[m] * v(k-1,j,i,4) ); } } } for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { v(k,j,i,m) = v(k,j,i,m) - omega * ( ldy(level,j,i,0)[m] * v(k,j-1,i,0) + ldx(level,j,i,0)[m] * v(k,j,i-1,0) + ldy(level,j,i,1)[m] * v(k,j-1,i,1) + ldx(level,j,i,1)[m] * v(k,j,i-1,1) + ldy(level,j,i,2)[m] * v(k,j-1,i,2) + ldx(level,j,i,2)[m] * v(k,j,i-1,2) + ldy(level,j,i,3)[m] * v(k,j-1,i,3) + ldx(level,j,i,3)[m] * v(k,j,i-1,3) + ldy(level,j,i,4)[m] * v(k,j-1,i,4) + ldx(level,j,i,4)[m] * v(k,j,i-1,4) ); } /** * Diagonal block inversion. * Forward elimination. */ for (m = 0; m < 5; m++) { tmat[0][m] = d(level,j,i,0)[m]; tmat[1][m] = d(level,j,i,1)[m]; tmat[2][m] = d(level,j,i,2)[m]; tmat[3][m] = d(level,j,i,3)[m]; tmat[4][m] = d(level,j,i,4)[m]; } // ip = 0. 
tmp1 = 1.0e+00 / tmat[0][0]; tmp = tmp1 * tmat[0][1]; tmat[1][1] = tmat[1][1] - tmp * tmat[1][0]; tmat[2][1] = tmat[2][1] - tmp * tmat[2][0]; tmat[3][1] = tmat[3][1] - tmp * tmat[3][0]; tmat[4][1] = tmat[4][1] - tmp * tmat[4][0]; v(k,j,i,1) = v(k,j,i,1) - v(k,j,i,0) * tmp; tmp = tmp1 * tmat[0][2]; tmat[1][2] = tmat[1][2] - tmp * tmat[1][0]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][0]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][0]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][0]; v(k,j,i,2) = v(k,j,i,2) - v(k,j,i,0) * tmp; tmp = tmp1 * tmat[0][3]; tmat[1][3] = tmat[1][3] - tmp * tmat[1][0]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][0]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][0]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][0]; v(k,j,i,3) = v(k,j,i,3) - v(k,j,i,0) * tmp; tmp = tmp1 * tmat[0][4]; tmat[1][4] = tmat[1][4] - tmp * tmat[1][0]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][0]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][0]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][0]; v(k,j,i,4) = v(k,j,i,4) - v(k,j,i,0) * tmp; // ip = 1. 
tmp1 = 1.0e+00 / tmat[1][1]; tmp = tmp1 * tmat[1][2]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][1]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][1]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][1]; v(k,j,i,2) = v(k,j,i,2) - v(k,j,i,1) * tmp; tmp = tmp1 * tmat[1][3]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][1]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][1]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][1]; v(k,j,i,3) = v(k,j,i,3) - v(k,j,i,1) * tmp; tmp = tmp1 * tmat[1][4]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][1]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][1]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][1]; v(k,j,i,4) = v(k,j,i,4) - v(k,j,i,1) * tmp; // ip = 2 tmp1 = 1.0e+00 / tmat[2][2]; tmp = tmp1 * tmat[2][3]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][2]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][2]; v(k,j,i,3) = v(k,j,i,3) - v(k,j,i,2) * tmp; tmp = tmp1 * tmat[2][4]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][2]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][2]; v(k,j,i,4) = v(k,j,i,4) - v(k,j,i,2) * tmp; // ip = 3 tmp1 = 1.0e+00 / tmat[3][3]; tmp = tmp1 * tmat[3][4]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][3]; v(k,j,i,4) = v(k,j,i,4) - v(k,j,i,3) * tmp; /** * Back substitution. */ v(k,j,i,4) = v(k,j,i,4) / tmat[4][4]; v(k,j,i,3) = v(k,j,i,3) - tmat[4][3] * v(k,j,i,4); v(k,j,i,3) = v(k,j,i,3) / tmat[3][3]; v(k,j,i,2) = v(k,j,i,2) - tmat[3][2] * v(k,j,i,3) - tmat[4][2] * v(k,j,i,4); v(k,j,i,2) = v(k,j,i,2) / tmat[2][2]; v(k,j,i,1) = v(k,j,i,1) - tmat[2][1] * v(k,j,i,2) - tmat[3][1] * v(k,j,i,3) - tmat[4][1] * v(k,j,i,4); v(k,j,i,1) = v(k,j,i,1) / tmat[1][1]; v(k,j,i,0) = v(k,j,i,0) - tmat[1][0] * v(k,j,i,1) - tmat[2][0] * v(k,j,i,2) - tmat[3][0] * v(k,j,i,3) - tmat[4][0] * v(k,j,i,4); v(k,j,i,0) = v(k,j,i,0) / tmat[0][0]; } } } // Send data to south and east. 
iex = 2; exchange_1(v, starting_k + kblock - 1, iex); } blts_cl.c0000644000175600017620000000714411564202133011032 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" /** * Lower triangular solution. */ void blts_cl(int starting_k, cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_offsets_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping) { cl_int status; // Set the arguments to the kernel. int wavefront = 0; status = clSetKernelArg(blts_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(blts_kernel, 1, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(blts_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d); status |= clSetKernelArg(blts_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d); status |= clSetKernelArg(blts_kernel, 4, sizeof(cl_mem), (void*) &columns); status |= clSetKernelArg(blts_kernel, 5, sizeof(cl_mem), (void*) &rows); status |= clSetKernelArg(blts_kernel, 6, sizeof(cl_mem), (void*) &thread_mapping); status |= clSetKernelArg(blts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clSetKernelArg(blts_kernel, 8, sizeof(cl_int), (void*) &starting_k); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for blts: "); // Call the kernels const size_t local = waveblock[0]; const size_t global = waveblock[0] * wavegrid[0]; // Call one kernel per hyperplane step. timer_start(3); for (wavefront = 0; wavefront < (isiz1 + 4) + (isiz2 + 4) + kblock - 2; wavefront++) { status |= clSetKernelArg(blts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clEnqueueNDRangeKernel(subQueue, blts_kernel, 1, NULL, &global, &local, 0, NULL, NULL); } timer_stop(3); wclCheckError(status, CL_SUCCESS, " Could not launch blts kernel: "); } /** * Lower triangular solution. * Note: Uses new k-blocking policy. 
*/ void blts_cl_new(cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_offsets_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping) { cl_int status; // Set the arguments to the kernel. int wavefront = 0; int k = 0; status = clSetKernelArg(blts_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(blts_kernel, 1, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(blts_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d); status |= clSetKernelArg(blts_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d); status |= clSetKernelArg(blts_kernel, 4, sizeof(cl_mem), (void*) &columns); status |= clSetKernelArg(blts_kernel, 5, sizeof(cl_mem), (void*) &rows); status |= clSetKernelArg(blts_kernel, 6, sizeof(cl_mem), (void*) &thread_mapping); status |= clSetKernelArg(blts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clSetKernelArg(blts_kernel, 8, sizeof(cl_int), (void*) &k); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for blts: "); // Call the kernels const size_t local = waveblock[0]; const size_t global = waveblock[0] * wavegrid[0]; while (wavefront < (isiz1 + 4) + (isiz2 + 4) + isiz3 - 2) { // Receive data from north and west. if (wavefront < isiz3) { exchange_1_cl(wavefront, 0); } timer_start(3); // Run kblock wavefront steps. int i; for (i = 0; i < kblock; i++) { status |= clSetKernelArg(blts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clEnqueueNDRangeKernel(subQueue, blts_kernel, 1, NULL, &global, &local, 0, NULL, NULL); wavefront++; } timer_stop(3); wclCheckError(status, CL_SUCCESS, " Could not launch blts kernel: "); // Send data south and east. if (wavefront >= (isiz1 + 4) + (isiz2 + 4) + kblock - 2) { exchange_1_cl(k + kblock - 1, 2); k += kblock; } } } buts.c0000644000175600017620000001304211553632705010371 0ustar sjpsjp#include "size.h" #include "util.h" /** * Compute the regular-sparse, block upper triangular solution. 
* v <-- ( U-inv ) * v */ void buts ( int ldmx, int ldmy, int ldmz, int nx, int ny, int nz, int starting_k, double omega, double**** v, double*** tv, double d, double udx, double udy, double udz, int ist, int iend, int jst, int jend, int nx0, int ny0, int ipt, int jpt ) { /** * Local variables. */ int i, j, k, m; int iex; double tmp, tmp1; double tmat[5][5]; // Receive data from south and east. iex = 1; exchange_1(v, starting_k, iex); const int kstart = min(starting_k, nz - 2); const int kend = max(starting_k - kblock, 0); for (k = kstart; k > kend; k--) { int level = k % kblock; for (j = jend; j >= jst; j--) { for (i = iend; i >= ist; i--) { for (m = 0; m < 5; m++) { tv[j][i][m] = omega * ( udz(level,j,i,0)[m] * v(k+1,j,i,0) + udz(level,j,i,1)[m] * v(k+1,j,i,1) + udz(level,j,i,2)[m] * v(k+1,j,i,2) + udz(level,j,i,3)[m] * v(k+1,j,i,3) + udz(level,j,i,4)[m] * v(k+1,j,i,4) ); } } } for (j = jend; j >= jst; j--) { for (i = iend; i >= ist; i--) { for (m = 0; m < 5; m++) { tv[j][i][m] = tv[j][i][m] + omega * ( udy(level,j,i,0)[m] * v(k,j+1,i,0) + udx(level,j,i,0)[m] * v(k,j,i+1,0) + udy(level,j,i,1)[m] * v(k,j+1,i,1) + udx(level,j,i,1)[m] * v(k,j,i+1,1) + udy(level,j,i,2)[m] * v(k,j+1,i,2) + udx(level,j,i,2)[m] * v(k,j,i+1,2) + udy(level,j,i,3)[m] * v(k,j+1,i,3) + udx(level,j,i,3)[m] * v(k,j,i+1,3) + udy(level,j,i,4)[m] * v(k,j+1,i,4) + udx(level,j,i,4)[m] * v(k,j,i+1,4) ); } /** * Diagonal block inversion. */ for (m = 0; m < 5; m++) { tmat[0][m] = d(level,j,i,0)[m]; tmat[1][m] = d(level,j,i,1)[m]; tmat[2][m] = d(level,j,i,2)[m]; tmat[3][m] = d(level,j,i,3)[m]; tmat[4][m] = d(level,j,i,4)[m]; } // ip = 0. 
tmp1 = 1.0e+00 / tmat[0][0]; tmp = tmp1 * tmat[0][1]; tmat[1][1] = tmat[1][1] - tmp * tmat[1][0]; tmat[2][1] = tmat[2][1] - tmp * tmat[2][0]; tmat[3][1] = tmat[3][1] - tmp * tmat[3][0]; tmat[4][1] = tmat[4][1] - tmp * tmat[4][0]; tv[j][i][1] = tv[j][i][1] - tv[j][i][0] * tmp; tmp = tmp1 * tmat[0][2]; tmat[1][2] = tmat[1][2] - tmp * tmat[1][0]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][0]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][0]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][0]; tv[j][i][2] = tv[j][i][2] - tv[j][i][0] * tmp; tmp = tmp1 * tmat[0][3]; tmat[1][3] = tmat[1][3] - tmp * tmat[1][0]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][0]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][0]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][0]; tv[j][i][3] = tv[j][i][3] - tv[j][i][0] * tmp; tmp = tmp1 * tmat[0][4]; tmat[1][4] = tmat[1][4] - tmp * tmat[1][0]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][0]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][0]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][0]; tv[j][i][4] = tv[j][i][4] - tv[j][i][0] * tmp; // ip = 1. tmp1 = 1.0e+00 / tmat[1][1]; tmp = tmp1 * tmat[1][2]; tmat[2][2] = tmat[2][2] - tmp * tmat[2][1]; tmat[3][2] = tmat[3][2] - tmp * tmat[3][1]; tmat[4][2] = tmat[4][2] - tmp * tmat[4][1]; tv[j][i][2] = tv[j][i][2] - tv[j][i][1] * tmp; tmp = tmp1 * tmat[1][3]; tmat[2][3] = tmat[2][3] - tmp * tmat[2][1]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][1]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][1]; tv[j][i][3] = tv[j][i][3] - tv[j][i][1] * tmp; tmp = tmp1 * tmat[1][4]; tmat[2][4] = tmat[2][4] - tmp * tmat[2][1]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][1]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][1]; tv[j][i][4] = tv[j][i][4] - tv[j][i][1] * tmp; // ip = 2. 
tmp1 = 1.0e+00 / tmat[2][2]; tmp = tmp1 * tmat[2][3]; tmat[3][3] = tmat[3][3] - tmp * tmat[3][2]; tmat[4][3] = tmat[4][3] - tmp * tmat[4][2]; tv[j][i][3] = tv[j][i][3] - tv[j][i][2] * tmp; tmp = tmp1 * tmat[2][4]; tmat[3][4] = tmat[3][4] - tmp * tmat[3][2]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][2]; tv[j][i][4] = tv[j][i][4] - tv[j][i][2] * tmp; // ip = 3. tmp = 1.0e+00 / tmat[3][3]; tmp = tmp1 * tmat[3][4]; tmat[4][4] = tmat[4][4] - tmp * tmat[4][3]; tv[j][i][4] = tv[j][i][4] - tv[j][i][3] * tmp; /** * Back substitution. */ tv[j][i][4] = tv[j][i][4] / tmat[4][4]; tv[j][i][3] = tv[j][i][3] - tmat[4][3] * tv[j][i][4]; tv[j][i][3] = tv[j][i][3] / tmat[3][3]; tv[j][i][2] = tv[j][i][2] - tmat[3][2] * tv[j][i][3] - tmat[4][2] * tv[j][i][4]; tv[j][i][2] = tv[j][i][2] / tmat[2][2]; tv[j][i][1] = tv[j][i][1] - tmat[2][1] * tv[j][i][2] - tmat[3][1] * tv[j][i][3] - tmat[4][1] * tv[j][i][4]; tv[j][i][1] = tv[j][i][1] / tmat[1][1]; tv[j][i][0] = tv[j][i][0] - tmat[1][0] * tv[j][i][1] - tmat[2][0] * tv[j][i][2] - tmat[3][0] * tv[j][i][3] - tmat[4][0] * tv[j][i][4]; tv[j][i][0] = tv[j][i][0] / tmat[0][0]; v(k,j,i,0) = v(k,j,i,0) - tv[j][i][0]; v(k,j,i,1) = v(k,j,i,1) - tv[j][i][1]; v(k,j,i,2) = v(k,j,i,2) - tv[j][i][2]; v(k,j,i,3) = v(k,j,i,3) - tv[j][i][3]; v(k,j,i,4) = v(k,j,i,4) - tv[j][i][4]; } } } // Send data to north and west. iex = 3; exchange_1(v, starting_k - kblock + 1, iex); } buts_cl.c0000644000175600017620000000734611564202136011052 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" /** * Upper triangular solution. */ void buts_cl(int starting_k, cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_offsets_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping) { cl_int status; // Set the arguments to the kernel. 
int wavefront = 0; status = clSetKernelArg(buts_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(buts_kernel, 1, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(buts_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d); status |= clSetKernelArg(buts_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d); status |= clSetKernelArg(buts_kernel, 4, sizeof(cl_mem), (void*) &columns); status |= clSetKernelArg(buts_kernel, 5, sizeof(cl_mem), (void*) &rows); status |= clSetKernelArg(buts_kernel, 6, sizeof(cl_mem), (void*) &thread_mapping); status |= clSetKernelArg(buts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clSetKernelArg(buts_kernel, 8, sizeof(cl_int), (void*) &starting_k); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for buts: "); // Call the kernels const size_t local = waveblock[0]; const size_t global = waveblock[0] * wavegrid[0]; // Call one kernel per hypeprlane step. timer_start(5); for (wavefront = (isiz1 + 4) + (isiz2 + 4) + kblock - 3; wavefront >= 0; wavefront--) { status |= clSetKernelArg(buts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clEnqueueNDRangeKernel(subQueue, buts_kernel, 1, NULL, &global, &local, 0, NULL, NULL); } timer_stop(5); wclCheckError(status, CL_SUCCESS, " Could not launch buts kernel: "); } /** * Upper triangular solution. * Note: Uses new k-blocking policy. */ void buts_cl_new(cl_mem rsd, cl_mem u, cl_mem wavefront_offsets_2d, cl_mem wavefront_offsets_3d, cl_mem columns, cl_mem rows, cl_mem thread_mapping) { cl_int status; // Set the arguments to the kernel. 
int wavefront = (isiz1 + 4) + (isiz2 + 4) + isiz3 - 3; int k = problem_height - 1; int wave_counter = 0; status = clSetKernelArg(buts_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(buts_kernel, 1, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(buts_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d); status |= clSetKernelArg(buts_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d); status |= clSetKernelArg(buts_kernel, 4, sizeof(cl_mem), (void*) &columns); status |= clSetKernelArg(buts_kernel, 5, sizeof(cl_mem), (void*) &rows); status |= clSetKernelArg(buts_kernel, 6, sizeof(cl_mem), (void*) &thread_mapping); status |= clSetKernelArg(buts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clSetKernelArg(buts_kernel, 8, sizeof(cl_int), (void*) &k); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for buts: "); // Call the kernels const size_t local = waveblock[0]; const size_t global = waveblock[0] * wavegrid[0]; while (wave_counter < (isiz1 + 4) + (isiz2 + 4) + isiz3 - 2) { // Receive data from north and west. if (wave_counter < isiz3) { exchange_1_cl((problem_height - 1) - wave_counter, 1); } timer_start(5); // Run kblock wavefront steps. int i; for (i = 0; i < kblock; i++) { status |= clSetKernelArg(buts_kernel, 7, sizeof(cl_int), (void*) &wavefront); status |= clEnqueueNDRangeKernel(subQueue, buts_kernel, 1, NULL, &global, &local, 0, NULL, NULL); wavefront--; wave_counter++; } timer_stop(5); wclCheckError(status, CL_SUCCESS, " Could not launch buts kernel: "); // Send data south and east. if (wave_counter >= (isiz1 + 4) + (isiz2 + 4) + kblock - 2) { exchange_1_cl(k - kblock + 1, 3); k -= kblock; } } } distribution0000644000175600017620000000665211561532057011721 0ustar sjpsjp if (opt_device == APPLU_DEVICE_CPU) { // Get the number of compute units. 
size_t comp_units; status = clGetDeviceInfo(subDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &comp_units, NULL); // Go for COARSE parallelism on CPUs; one work-item per work-group, with parallelism in outermost dimension. rhsblock[0] = 1; rhsblock[1] = 1; rhsblock[2] = 1; rhsgrid[0] = 1; rhsgrid[1] = 1; rhsgrid[2] = comp_units; waveblock[0] = 1; waveblock[1] = 1; waveblock[2] = 1; wavegrid[0] = comp_units; wavegrid[1] = 1; wavegrid[2] = 1; ex1iblock[0] = 1; ex1iblock[1] = 1; ex1iblock[2] = 1; ex1igrid[0] = 1; ex1igrid[1] = comp_units; ex1igrid[2] = 1; ex1jblock[0] = 1; ex1jblock[1] = 1; ex1jblock[2] = 1; ex1jgrid[0] = 1; ex1jgrid[1] = comp_units; ex1jgrid[2] = 1; ex3iblock[0] = 1; ex3iblock[1] = 1; ex3iblock[2] = 1; ex3igrid[0] = 1; ex3igrid[1] = comp_units; ex3igrid[2] = 1; ex3jblock[0] = 1; ex3jblock[1] = 1; ex3jblock[2] = 1; ex3jgrid[0] = 1; ex3jgrid[1] = comp_units; ex3jgrid[2] = 1; // Note: && opt_platform == APPLU_PLATFORM_NVIDIA! // TODO: Play with these numbers a little bit. 
} else if (opt_device == APPLU_DEVICE_GPU && opt_platform == APPLU_PLATFORM_NVIDIA) { rhsblock[0] = 8; rhsblock[1] = 8; rhsblock[2] = 1; rhsgrid[0] = ceil((isiz1 + 4)/ (double) rhsblock[0]); rhsgrid[1] = ceil((isiz2 + 4)/ (double) rhsblock[1]); rhsgrid[2] = isiz3; waveblock[0] = 64; waveblock[1] = 1; waveblock[2] = 1; wavegrid[0] = ceil( ((isiz1 + 4) * (isiz2 + 4)) / (double) waveblock[0] ); wavegrid[1] = 1; wavegrid[2] = 1; ex1iblock[0] = 64; ex1iblock[1] = 1; ex1iblock[2] = 1; ex1igrid[0] = ceil( (iend - ist + 1) / (double) ex1iblock[0] ); ex1igrid[1] = kblock; ex1igrid[2] = 1; ex1jblock[0] = 64; ex1jblock[1] = 1; ex1jblock[2] = 1; ex1jgrid[0] = ceil( (jend - jst + 1) / (double) ex1jblock[0] ); ex1jgrid[1] = kblock; ex1jgrid[2] = 1; ex3iblock[0] = 64; ex3iblock[1] = 1; ex3iblock[2] = 1; ex3igrid[0] = ceil( ny / (double) ex3iblock[0] ); ex3igrid[1] = nz; ex3igrid[2] = 1; ex3jblock[0] = 64; ex3jblock[1] = 1; ex3jblock[2] = 1; ex3jgrid[0] = ceil( nx / (double) ex3jblock[0] ); ex3jgrid[1] = nz; ex3jgrid[2] = 1; } else if (opt_device == APPLU_DEVICE_GPU && opt_platform == APPLU_PLATFORM_AMD) { rhsblock[0] = 64; rhsblock[1] = 1; rhsblock[2] = 1; rhsgrid[0] = 1; rhsgrid[1] = 1; rhsgrid[2] = isiz3; waveblock[0] = 64; waveblock[1] = 1; waveblock[2] = 1; wavegrid[0] = ceil( ((isiz1 + 4) * (isiz2 + 4)) / (double) waveblock[0] ); wavegrid[1] = 1; wavegrid[2] = 1; ex1iblock[0] = 64; ex1iblock[1] = 1; ex1iblock[2] = 1; ex1igrid[0] = ceil( (iend - ist + 1) / (double) ex1iblock[0] ); ex1igrid[1] = kblock; ex1igrid[2] = 1; ex1jblock[0] = 64; ex1jblock[1] = 1; ex1jblock[2] = 1; ex1jgrid[0] = ceil( (jend - jst + 1) / (double) ex1jblock[0] ); ex1jgrid[1] = kblock; ex1jgrid[2] = 1; ex3iblock[0] = 64; ex3iblock[1] = 1; ex3iblock[2] = 1; ex3igrid[0] = ceil( ny / (double) ex3iblock[0] ); ex3igrid[1] = nz; ex3igrid[2] = 1; ex3jblock[0] = 64; ex3jblock[1] = 1; ex3jblock[2] = 1; ex3jgrid[0] = ceil( nx / (double) ex3jblock[0] ); ex3jgrid[1] = nz; ex3jgrid[2] = 1; } 
erhs.c0000644000175600017620000003525511553632706010370 0ustar sjpsjp// C port of NPB3.2 // subroutine erhs #include "applu.h" /** * Compute the right hand side based on exact solution. */ void erhs() { /** * Local variables. */ int i, j, k, m; int iglob, jglob; int iex; int L1, L2; int ist1, iend1; int jst1, jend1; double dsspm; double xi, eta, zeta; double q; double u21, u31, u41; double tmp; double u21i, u31i, u41i, u51i; double u21j, u31j, u41j, u51j; double u21k, u31k, u41k, u51k; double u21im1, u31im1, u41im1, u51im1; double u21jm1, u31jm1, u41jm1, u51jm1; double u21km1, u31km1, u41km1, u51km1; dsspm = dssp; for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { for (i = 2; i <= nx + 1; i++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = 0.0e+00; } } } } for (k = 0; k <= nz - 1; k++) { zeta = ( (double) (k) ) / (nz - 1); for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; eta = ( (double) (jglob - 2) ) / (ny0 - 1); for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; xi = ( (double) (iglob - 2) ) / (nx0 - 1); for (m = 0; m < 5; m++) { rsd(k,j,i,m) = ce[0][m] + ce[1][m] * xi + ce[2][m] * eta + ce[3][m] * zeta + ce[4][m] * xi * xi + ce[5][m] * eta * eta + ce[6][m] * zeta * zeta + ce[7][m] * xi * xi * xi + ce[8][m] * eta * eta * eta + ce[9][m] * zeta * zeta * zeta + ce[10][m] * xi * xi * xi * xi + ce[11][m] * eta * eta * eta * eta + ce[12][m] * zeta * zeta * zeta * zeta; } } } } /** * xi-direction flux differences. * * iex = flag : iex = 0 north/south communication. * : iex = 1 east/west communication. */ iex = 0; // Communicate and receive/send two rows of data. // TODO: May need to pass this by reference... 
exchange_3 (rsd, iex); L1 = 1; if (north == -1) { L1 = 2; } L2 = nx + 2; if (south == -1) { L2 = nx + 1; } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { for (i = L1; i <= L2; i++) { flux(k,j,i,0) = rsd(k,j,i,1); u21 = rsd(k,j,i,1) / rsd(k,j,i,0); q = 0.50e+00 * ( rsd(k,j,i,1) * rsd(k,j,i,1) + rsd(k,j,i,2) * rsd(k,j,i,2) + rsd(k,j,i,3) * rsd(k,j,i,3) ) / rsd(k,j,i,0); flux(k,j,i,1) = rsd(k,j,i,1) * u21 + c2 * ( rsd(k,j,i,4) - q ); flux(k,j,i,2) = rsd(k,j,i,2) * u21; flux(k,j,i,3) = rsd(k,j,i,3) * u21; flux(k,j,i,4) = ( c1 * rsd(k,j,i,4) - c2 * q ) * u21; } } } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = frct(k,j,i,m) - tx2 * ( flux(k,j,i+1,m) - flux(k,j,i-1,m) ); } } for (i = ist; i <= L2; i++) { tmp = 1.0e+00 / rsd(k,j,i,0); u21i = tmp * rsd(k,j,i,1); u31i = tmp * rsd(k,j,i,2); u41i = tmp * rsd(k,j,i,3); u51i = tmp * rsd(k,j,i,4); tmp = 1.0e+00 / rsd(k,j,i-1,0); u21im1 = tmp * rsd(k,j,i-1,1); u31im1 = tmp * rsd(k,j,i-1,2); u41im1 = tmp * rsd(k,j,i-1,3); u51im1 = tmp * rsd(k,j,i-1,4); flux(k,j,i,1) = (4.0e+00/3.0e+00) * tx3 * ( u21i - u21im1 ); flux(k,j,i,2) = tx3 * ( u31i - u31im1 ); flux(k,j,i,3) = tx3 * ( u41i - u41im1 ); flux(k,j,i,4) = 0.50e+00 * ( 1.0e+00 - c1*c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } for (i = ist; i <= iend; i++) { frct(k,j,i,0) = frct(k,j,i,0) + dx1 * tx1 * ( rsd(k,j,i-1,0) - 2.0e+00 * rsd(k,j,i,0) + rsd(k,j,i+1,0) ); frct(k,j,i,1) = frct(k,j,i,1) + tx3 * c3 * c4 * ( flux(k,j,i+1,1) - flux(k,j,i,1) ) + dx2 * tx1 * ( rsd(k,j,i-1,1) - 2.0e+00 * rsd(k,j,i,1) + rsd(k,j,i+1,1) ); frct(k,j,i,2) = frct(k,j,i,2) + tx3 * c3 * c4 * ( flux(k,j,i+1,2) - flux(k,j,i,2) ) + dx3 * tx1 * ( rsd(k,j,i-1,2) - 2.0e+00 * rsd(k,j,i,2) + rsd(k,j,i+1,2) ); frct(k,j,i,3) = 
frct(k,j,i,3) + tx3 * c3 * c4 * ( flux(k,j,i+1,3) - flux(k,j,i,3) ) + dx4 * tx1 * ( rsd(k,j,i-1,3) - 2.0e+00 * rsd(k,j,i,3) + rsd(k,j,i+1,3) ); frct(k,j,i,4) = frct(k,j,i,4) + tx3 * c3 * c4 * ( flux(k,j,i+1,4) - flux(k,j,i,4) ) + dx5 * tx1 * ( rsd(k,j,i-1,4) - 2.0e+00 * rsd(k,j,i,4) + rsd(k,j,i+1,4) ); } /** * Fourth-order dissipation. */ if (north == -1) { for (m = 0; m < 5; m++) { frct(k,j,3,m) = frct(k,j,3,m) - dsspm * ( + 5.0e+00 * rsd(k,j,3,m) - 4.0e+00 * rsd(k,j,4,m) + rsd(k,j,5,m) ); frct(k,j,4,m) = frct(k,j,4,m) - dsspm * ( - 4.0e+00 * rsd(k,j,3,m) + 6.0e+00 * rsd(k,j,4,m) - 4.0e+00 * rsd(k,j,5,m) + rsd(k,j,6,m) ); } } ist1 = 2; iend1 = nx + 1; if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } for (i = ist1; i <= iend1; i++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = frct(k,j,i,m) - dsspm * ( rsd(k,j,i-2,m) - 4.0e+00 * rsd(k,j,i-1,m) + 6.0e+00 * rsd(k,j,i,m) - 4.0e+00 * rsd(k,j,i+1,m) + rsd(k,j,i+2,m) ); } } if (south == -1) { for (m = 0; m < 5; m++) { frct(k,j,nx-1,m) = frct(k,j,nx-1,m) - dsspm * ( rsd(k,j,nx-3,m) - 4.0e+00 * rsd(k,j,nx-2,m) + 6.0e+00 * rsd(k,j,nx-1,m) - 4.0e+00 * rsd(k,j,nx,m) ); frct(k,j,nx,m) = frct(k,j,nx,m) - dsspm * ( rsd(k,j,nx-2,m) - 4.0e+00 * rsd(k,j,nx-1,m) + 5.0e+00 * rsd(k,j,nx,m)); } } } } /** * eta-direction flux differences. * * iex = flag : iex = 0 north/south communication. * : iex = 1 east/west communication. */ iex = 1; // Communicate and receive/send two rows of data. // TODO: May need to pass this by reference... 
exchange_3 (rsd, iex); L1 = 1; if (west == -1) { L1 = 2; } L2 = ny + 2; if (east == -1) { L2 = ny + 1; } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = L1; j <= L2; j++) { flux(k,j,i,0) = rsd(k,j,i,2); u31 = rsd(k,j,i,2) / rsd(k,j,i,0); q = 0.50e+00 * ( rsd(k,j,i,1) * rsd(k,j,i,1) + rsd(k,j,i,2) * rsd(k,j,i,2) + rsd(k,j,i,3) * rsd(k,j,i,3) ) / rsd(k,j,i,0); flux(k,j,i,1) = rsd(k,j,i,1) * u31; flux(k,j,i,2) = rsd(k,j,i,2) * u31 + c2 * (rsd(k,j,i,4) - q); flux(k,j,i,3) = rsd(k,j,i,3) * u31; flux(k,j,i,4) = ( c1 * rsd(k,j,i,4) - c2 * q ) * u31; } } } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = jst; j <= jend; j++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = frct(k,j,i,m) - ty2 * ( flux(k,j+1,i,m) - flux(k,j-1,i,m) ); } } for (j = jst; j <= L2; j++) { tmp = 1.0e+00 / rsd(k,j,i,0); u21j = tmp * rsd(k,j,i,1); u31j = tmp * rsd(k,j,i,2); u41j = tmp * rsd(k,j,i,3); u51j = tmp * rsd(k,j,i,4); tmp = 1.0e+00 / rsd(k,j-1,i,0); u21jm1 = tmp * rsd(k,j-1,i,1); u31jm1 = tmp * rsd(k,j-1,i,2); u41jm1 = tmp * rsd(k,j-1,i,3); u51jm1 = tmp * rsd(k,j-1,i,4); flux(k,j,i,1) = ty3 * ( u21j - u21jm1 ); flux(k,j,i,2) = (4.0e+00/3.0e+00) * ty3 * ( u31j - u31jm1 ); flux(k,j,i,3) = ty3 * ( u41j - u41jm1 ); flux(k,j,i,4) = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } for (j = jst; j <= jend; j++) { frct(k,j,i,0) = frct(k,j,i,0) + dy1 * ty1 * ( rsd(k,j-1,i,0) - 2.0e+00 * rsd(k,j,i,0) + rsd(k,j+1,i,0) ); frct(k,j,i,1) = frct(k,j,i,1) + ty3 * c3 * c4 * ( flux(k,j+1,i,1) - flux(k,j,i,1) ) + dy2 * ty1 * ( rsd(k,j-1,i,1) - 2.0e+00 * rsd(k,j,i,1) + rsd(k,j+1,i,1) ); frct(k,j,i,2) = frct(k,j,i,2) + ty3 * c3 * c4 * ( flux(k,j+1,i,2) - flux(k,j,i,2) ) + dy3 * ty1 * ( rsd(k,j-1,i,2) - 2.0e+00 * rsd(k,j,i,2) + rsd(k,j+1,i,2) ); frct(k,j,i,3) = 
frct(k,j,i,3) + ty3 * c3 * c4 * ( flux(k,j+1,i,3) - flux(k,j,i,3) ) + dy4 * ty1 * ( rsd(k,j-1,i,3) - 2.0e+00 * rsd(k,j,i,3) + rsd(k,j+1,i,3) ); frct(k,j,i,4) = frct(k,j,i,4) + ty3 * c3 * c4 * ( flux(k,j+1,i,4) - flux(k,j,i,4) ) + dy5 * ty1 * ( rsd(k,j-1,i,4) - 2.0e+00 * rsd(k,j,i,4) + rsd(k,j+1,i,4) ); } /** * Fourth-order dissipation. */ if (west == -1) { for (m = 0; m < 5; m++) { frct(k,3,i,m) = frct(k,3,i,m) - dsspm * ( + 5.0e+00 * rsd(k,3,i,m) - 4.0e+00 * rsd(k,4,i,m) + rsd(k,5,i,m) ); frct(k,4,i,m) = frct(k,4,i,m) - dsspm * ( - 4.0e+00 * rsd(k,3,i,m) + 6.0e+00 * rsd(k,4,i,m) - 4.0e+00 * rsd(k,5,i,m) + rsd(k,6,i,m) ); } } jst1 = 2; jend1 = ny + 1; if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } for (j = jst1; j <= jend1; j++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = frct(k,j,i,m) - dsspm * ( rsd(k,j-2,i,m) - 4.0e+00 * rsd(k,j-1,i,m) + 6.0e+00 * rsd(k,j,i,m) - 4.0e+00 * rsd(k,j+1,i,m) + rsd(k,j+2,i,m) ); } } if (east == -1) { for (m = 0; m < 5; m++) { frct(k,ny-1,i,m) = frct(k,ny-1,i,m) -dsspm * ( rsd(k,ny-3,i,m) - 4.0e+00 * rsd(k,ny-2,i,m) + 6.0e+00 * rsd(k,ny-1,i,m) - 4.0e+00 * rsd(k,ny,i,m) ); frct(k,ny,i,m) = frct(k,ny,i,m) -dsspm * ( rsd(k,ny-2,i,m) - 4.0e+00 * rsd(k,ny-1,i,m) + 5.0e+00 * rsd(k,ny,i,m) ); } } } } /** * zeta-direction flux differences. 
*/ for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (k = 0; k <= nz - 1; k++) { flux(k,j,i,0) = rsd(k,j,i,3); u41 = rsd(k,j,i,3) / rsd(k,j,i,0); q = 0.50e+00 * ( rsd(k,j,i,1) * rsd(k,j,i,1) + rsd(k,j,i,2) * rsd(k,j,i,2) + rsd(k,j,i,3) * rsd(k,j,i,3) ) / rsd(k,j,i,0); flux(k,j,i,1) = rsd(k,j,i,1) * u41; flux(k,j,i,2) = rsd(k,j,i,2) * u41; flux(k,j,i,3) = rsd(k,j,i,3) * u41 + c2 * ( rsd(k,j,i,4) - q ); flux(k,j,i,4) = ( c1 * rsd(k,j,i,4) - c2 * q ) * u41; } for (k = 1; k <= nz - 2; k++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = frct(k,j,i,m) - tz2 * ( flux(k+1,j,i,m) - flux(k-1,j,i,m) ); } } for (k = 1; k <= nz - 1; k++) { tmp = 1.0e+00 / rsd(k,j,i,0); u21k = tmp * rsd(k,j,i,1); u31k = tmp * rsd(k,j,i,2); u41k = tmp * rsd(k,j,i,3); u51k = tmp * rsd(k,j,i,4); tmp = 1.0e+00 / rsd(k-1,j,i,0); u21km1 = tmp * rsd(k-1,j,i,1); u31km1 = tmp * rsd(k-1,j,i,2); u41km1 = tmp * rsd(k-1,j,i,3); u51km1 = tmp * rsd(k-1,j,i,4); flux(k,j,i,1) = tz3 * ( u21k - u21km1 ); flux(k,j,i,2) = tz3 * ( u31k - u31km1 ); flux(k,j,i,3) = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux(k,j,i,4) = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } for (k = 1; k <= nz - 2; k++) { frct(k,j,i,0) = frct(k,j,i,0) + dz1 * tz1 * ( rsd(k+1,j,i,0) - 2.0e+00 * rsd(k,j,i,0) + rsd(k-1,j,i,0) ); frct(k,j,i,1) = frct(k,j,i,1) + tz3 * c3 * c4 * ( flux(k+1,j,i,1) - flux(k,j,i,1) ) + dz2 * tz1 * ( rsd(k+1,j,i,1) - 2.0e+00 * rsd(k,j,i,1) + rsd(k-1,j,i,1) ); frct(k,j,i,2) = frct(k,j,i,2) + tz3 * c3 * c4 * ( flux(k+1,j,i,2) - flux(k,j,i,2) ) + dz3 * tz1 * ( rsd(k+1,j,i,2) - 2.0e+00 * rsd(k,j,i,2) + rsd(k-1,j,i,2) ); frct(k,j,i,3) = frct(k,j,i,3) + tz3 * c3 * c4 * ( flux(k+1,j,i,3) - flux(k,j,i,3) ) + dz4 * tz1 * ( rsd(k+1,j,i,3) - 2.0e+00 * rsd(k,j,i,3) + rsd(k-1,j,i,3) ); frct(k,j,i,4) = frct(k,j,i,4) 
+ tz3 * c3 * c4 * ( flux(k+1,j,i,4) - flux(k,j,i,4) ) + dz5 * tz1 * ( rsd(k+1,j,i,4) - 2.0e+00 * rsd(k,j,i,4) + rsd(k-1,j,i,4) ); } /** * Fourth-order dissipation. */ for (m = 0; m < 5; m++) { frct(1,j,i,m) = frct(1,j,i,m) - dsspm * ( + 5.0e+00 * rsd(1,j,i,m) - 4.0e+00 * rsd(2,j,i,m) + rsd(3,j,i,m) ); frct(2,j,i,m) = frct(2,j,i,m) - dsspm * ( - 4.0e+00 * rsd(1,j,i,m) + 6.0e+00 * rsd(2,j,i,m) - 4.0e+00 * rsd(3,j,i,m) + rsd(4,j,i,m) ); } for (k = 3; k <= nz - 4; k++) { for (m = 0; m < 5; m++) { frct(k,j,i,m) = frct(k,j,i,m) - dsspm * ( rsd(k-2,j,i,m) - 4.0e+00 * rsd(k-1,j,i,m) + 6.0e+00 * rsd(k,j,i,m) - 4.0e+00 * rsd(k+1,j,i,m) + rsd(k+2,j,i,m) ); } } for (m = 0; m < 5; m++) { frct(nz-3,j,i,m) = frct(nz-3,j,i,m) - dsspm * ( rsd(nz-5,j,i,m) - 4.0e+00 * rsd(nz-4,j,i,m) + 6.0e+00 * rsd(nz-3,j,i,m) - 4.0e+00 * rsd(nz-2,j,i,m) ); frct(nz-2,j,i,m) = frct(nz-2,j,i,m) - dsspm * ( rsd(nz-4,j,i,m) - 4.0e+00 * rsd(nz-3,j,i,m) + 5.0e+00 * rsd(nz-2,j,i,m) ); } } } } error.c0000644000175600017620000000245111553632706010550 0ustar sjpsjp// C port of NPB3.2 // subroutine error #include "applu.h" #include "mpinpb.h" #include /** * Compute the solution error. */ void error() { /** * Local variables. */ int i, j, k, m; int iglob, jglob; double tmp; double u000ijk[5], dummy[5]; for (m = 0; m < 5; m++) { errnm[m] = 0.0e+00; dummy[m] = 0.0e+00; } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { jglob = jpt + j; for (i = ist; i <= iend; i++) { iglob = ipt + i; //exact(iglob, jglob, k, &u000ijk); exact(iglob, jglob, k, u000ijk); for (m = 0; m < 5; m++) { tmp = ( u000ijk[m] - u(k,j,i,m) ); dummy[m] = dummy[m] + (tmp * tmp); } } } } /** * Compute the global sum of individual contributions to dot product. */ MPI_Allreduce(dummy, errnm, 5, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); for (m = 0; m < 5; m++) { errnm[m] = sqrt ( errnm[m] / ( (nx0-2)*(ny0-2)*(nz0-2) ) ); } /* if (id != 0) { printf("RMS-norm of error in soln. 
to first pde = %d.\n", errnm[0]); printf("RMS-norm of error in soln. to second pde = %d.\n", errnm[1]); printf("RMS-norm of error in soln. to third pde = %d.\n", errnm[2]); printf("RMS-norm of error in soln. to fourth pde = %d.\n", errnm[3]); printf("RMS-norm of error in soln. to fifth pde = %d.\n", errnm[4]); }*/ } exact.c0000644000175600017620000000202211553632706010515 0ustar sjpsjp// C port of NPB3.2 // subroutine exact( i, j, k, u000ijk ) #include "applu.h" /** * Compute the exact solution at (i, j, k); */ void exact(int i, int j, int k, double u000ijk[5]) { /** * Local variables. */ int m; double xi, eta, zeta; // Note: Originally i-1, j-1, k-1; this may need fixing later. /*xi = ((double) (i - 1)) / (nx0 - 1); eta = ((double) (j - 1)) / (ny0 - 1); zeta = ((double) (k - 1)) / (nz - 1);*/ xi = ((double) (i - 2)) / (nx0 - 1); eta = ((double) (j - 2)) / (ny0 - 1); zeta = ((double) k) / (nz - 1); for (m = 0; m < 5; m++) { u000ijk[m] = ce[0][m] + ce[1][m] * xi + ce[2][m] * eta + ce[3][m] * zeta + ce[4][m] * xi * xi + ce[5][m] * eta * eta + ce[6][m] * zeta * zeta + ce[7][m] * xi * xi * xi + ce[8][m] * eta * eta * eta + ce[9][m] * zeta * zeta * zeta + ce[10][m] * xi * xi * xi * xi + ce[11][m] * eta * eta * eta * eta + ce[12][m] * zeta * zeta * zeta * zeta; //printf("For i = %d, j = %d, k = %d, exact[%d] = %e.\n", i, j, k, m, u000ijk[m]); } } exchange_1.c0000644000175600017620000001307511553632706011425 0ustar sjpsjp// C port of NPB3.2 // subroutine exchange_1(g, k, iex) #include "mpinpb.h" #include "applu.h" #include #include #define g(k,j,i,m) (g[(((k) * (isiz2 + 4) + (j)) * (isiz1 + 4) + (i)) * 5 + (m)]) /** * iex = 0 : Receive north/west. * iex = 1 : Receive south/east. * iex = 2 : Send south/east. * iex = 3 : Send north/west. */ void exchange_1(double* g, int k, int iex){ int i, j, z; double isend[kblock][iend-ist+1][5], jsend[kblock][jend-jst+1][5]; double irecv[kblock][iend-ist+1][5], jrecv[kblock][jend-jst+1][5]; MPI_Status status; /** * Receive north/west. 
*/ if (iex == 0) { // Receive from north. if (north != -1) { MPI_Recv(jrecv, kblock*(jend-jst+1)*5, MPI_DOUBLE, north, from_n, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z >= 1 && k + z <= nz - 2) { for (j = jst; j <= jend; j++) { g(k + z,j,1,0) = jrecv[z][j - jst][0]; g(k + z,j,1,1) = jrecv[z][j - jst][1]; g(k + z,j,1,2) = jrecv[z][j - jst][2]; g(k + z,j,1,3) = jrecv[z][j - jst][3]; g(k + z,j,1,4) = jrecv[z][j - jst][4]; } } } } // Receive from west. if (west != -1) { MPI_Recv(irecv, kblock*(iend-ist+1)*5, MPI_DOUBLE, west, from_w, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z >= 1 && k + z <= nz - 2) { for (i = ist; i <= iend; i++) { g(k + z,1,i,0) = irecv[z][i - ist][0]; g(k + z,1,i,1) = irecv[z][i - ist][1]; g(k + z,1,i,2) = irecv[z][i - ist][2]; g(k + z,1,i,3) = irecv[z][i - ist][3]; g(k + z,1,i,4) = irecv[z][i - ist][4]; } } } } /** * Receive south/east. */ } else if (iex == 1) { // Receive from south. if (south != -1) { MPI_Recv(jrecv, kblock*(jend-jst+1)*5, MPI_DOUBLE, south, from_s, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (j = jst; j <= jend; j++) { g(k + z - (kblock - 1),j,nx+2,0) = jrecv[z][j - jst][0]; g(k + z - (kblock - 1),j,nx+2,1) = jrecv[z][j - jst][1]; g(k + z - (kblock - 1),j,nx+2,2) = jrecv[z][j - jst][2]; g(k + z - (kblock - 1),j,nx+2,3) = jrecv[z][j - jst][3]; g(k + z - (kblock - 1),j,nx+2,4) = jrecv[z][j - jst][4]; } } } } // Receive from east. 
if (east != -1) { MPI_Recv(irecv, kblock*(iend-ist+1)*5, MPI_DOUBLE, east, from_e, MPI_COMM_WORLD, &status); for (z = 0; z < kblock; z++) { if (k + z - (kblock - 1) >= 1 && k + z - (kblock - 1) <= nz - 2) { for (i = ist; i <= iend; i++) { g(k + z - (kblock - 1),ny+2,i,0) = irecv[z][i - ist][0]; g(k + z - (kblock - 1),ny+2,i,1) = irecv[z][i - ist][1]; g(k + z - (kblock - 1),ny+2,i,2) = irecv[z][i - ist][2]; g(k + z - (kblock - 1),ny+2,i,3) = irecv[z][i - ist][3]; g(k + z - (kblock - 1),ny+2,i,4) = irecv[z][i - ist][4]; } } } } /** * Send south/east. */ } else if (iex == 2) { // Send south. if (south != -1) { for (z = 0; z < kblock; z++) { if (k + z - (kblock-1) >= 1 && k + z - (kblock-1) <= nz - 2) { for (j = jst; j <= jend; j++) { jsend[z][j - jst][0] = g(k + z - (kblock-1),j,nx+1,0); jsend[z][j - jst][1] = g(k + z - (kblock-1),j,nx+1,1); jsend[z][j - jst][2] = g(k + z - (kblock-1),j,nx+1,2); jsend[z][j - jst][3] = g(k + z - (kblock-1),j,nx+1,3); jsend[z][j - jst][4] = g(k + z - (kblock-1),j,nx+1,4); } } } MPI_Send(jsend, kblock*(jend-jst+1)*5, MPI_DOUBLE, south, from_n, MPI_COMM_WORLD); } // Send east. if (east != -1) { for (z = 0; z < kblock; z++) { if (k + z - (kblock-1) >= 1 && k + z - (kblock-1) <= nz - 2) { for (i = ist; i <= iend; i++) { isend[z][i - ist][0] = g(k + z - (kblock-1),ny+1,i,0); isend[z][i - ist][1] = g(k + z - (kblock-1),ny+1,i,1); isend[z][i - ist][2] = g(k + z - (kblock-1),ny+1,i,2); isend[z][i - ist][3] = g(k + z - (kblock-1),ny+1,i,3); isend[z][i - ist][4] = g(k + z - (kblock-1),ny+1,i,4); } } } MPI_Send(isend, kblock*(iend-ist+1)*5, MPI_DOUBLE, east, from_w, MPI_COMM_WORLD); } /** * Send north/west. */ } else { // Send north. 
if (north != -1) { for (z = 0; z < kblock; z++) { if (k + z >= 1 && k <= nz - 2) { for (j = jst; j <= jend; j++) { jsend[z][j - jst][0] = g(k + z,j,2,0); jsend[z][j - jst][1] = g(k + z,j,2,1); jsend[z][j - jst][2] = g(k + z,j,2,2); jsend[z][j - jst][3] = g(k + z,j,2,3); jsend[z][j - jst][4] = g(k + z,j,2,4); } } } MPI_Send(jsend, kblock*(jend-jst+1)*5, MPI_DOUBLE, north, from_s, MPI_COMM_WORLD); } // Send west. if (west != -1) { for (z = 0; z < kblock; z++) { if (k + z >= 1 && k <= nz - 2) { for (i = ist; i <= iend; i++) { isend[z][i - ist][0] = g(k + z,2,i,0); isend[z][i - ist][1] = g(k + z,2,i,1); isend[z][i - ist][2] = g(k + z,2,i,2); isend[z][i - ist][3] = g(k + z,2,i,3); isend[z][i - ist][4] = g(k + z,2,i,4); } } } MPI_Send(isend, kblock*(iend-ist+1)*5, MPI_DOUBLE, west, from_e, MPI_COMM_WORLD); } } } exchange_1_cl.c0000644000175600017620000002642611553632706012107 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" #include #include /** * iex = 0 : Receive north/west. * iex = 1 : Receive south/east. * iex = 2 : Send south/east. * iex = 3 : Send north/west. */ void exchange_1_cl(int k, int iex) { MPI_Status mpi_status; cl_int status; /** * Receive north/west. */ if (iex == 0) { if (north != -1) { // Receive from north. MPI_Recv(jbuf, kblock * (jend - jst + 1) * 5, MPI_DOUBLE, north, from_n, MPI_COMM_WORLD, &mpi_status); timer_start(10); // Send the buffer to the device. clEnqueueWriteBuffer(subQueue, jbuffer_d, CL_TRUE, 0, kblock * (jend - jst + 1) * 5 * sizeof(double), jbuf, 0, NULL, NULL); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_unpack_north_kernel, 0, sizeof(cl_mem), (void*) &jbuffer_d); status |= clSetKernelArg(ex1_unpack_north_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_unpack_north_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_unpack_north_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_unpack_north_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_unpack_north_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_unpack_north: "); size_t local[3] = { ex1jblock[0], ex1jblock[1], ex1jblock[2] }; size_t global[3] = { ex1jblock[0] * ex1jgrid[0], ex1jblock[1] * ex1jgrid[1], ex1jblock[2] * ex1jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_unpack_north_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); } if (west != -1) { // Receive from west. MPI_Recv(ibuf, kblock * (iend - ist + 1) * 5, MPI_DOUBLE, west, from_w, MPI_COMM_WORLD, &mpi_status); timer_start(10); // Send the buffer to the device. clEnqueueWriteBuffer(subQueue, ibuffer_d, CL_TRUE, 0, kblock * (iend - ist + 1) * 5 * sizeof(double), ibuf, 0, NULL, NULL); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_unpack_west_kernel, 0, sizeof(cl_mem), (void*) &ibuffer_d); status |= clSetKernelArg(ex1_unpack_west_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_unpack_west_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_unpack_west_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_unpack_west_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_unpack_west_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_unpack_west: "); size_t local[3] = { ex1iblock[0], ex1iblock[1], ex1iblock[2] }; size_t global[3] = { ex1iblock[0] * ex1igrid[0], ex1iblock[1] * ex1igrid[1], ex1iblock[2] * ex1igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_unpack_west_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); } /** * Receive south/east. */ } else if (iex == 1) { if (south != -1) { // Receive from south. MPI_Recv(jbuf, kblock * (jend - jst + 1) * 5, MPI_DOUBLE, south, from_s, MPI_COMM_WORLD, &mpi_status); timer_start(10); // Send the buffer to the device. clEnqueueWriteBuffer(subQueue, jbuffer_d, CL_TRUE, 0, kblock * (jend - jst + 1) * 5 * sizeof(double), jbuf, 0, NULL, NULL); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_unpack_south_kernel, 0, sizeof(cl_mem), (void*) &jbuffer_d); status |= clSetKernelArg(ex1_unpack_south_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_unpack_south_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_unpack_south_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_unpack_south_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_unpack_south_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_unpack_south: "); size_t local[3] = { ex1jblock[0], ex1jblock[1], ex1jblock[2] }; size_t global[3] = { ex1jblock[0] * ex1jgrid[0], ex1jblock[1] * ex1jgrid[1], ex1jblock[2] * ex1jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_unpack_south_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); } if (east != -1) { // Receive from east. MPI_Recv(ibuf, kblock * (iend - ist + 1) * 5, MPI_DOUBLE, east, from_e, MPI_COMM_WORLD, &mpi_status); timer_start(10); // Send the buffer to the device. clEnqueueWriteBuffer(subQueue, ibuffer_d, CL_TRUE, 0, kblock * (iend - ist + 1) * 5 * sizeof(double), ibuf, 0, NULL, NULL); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_unpack_east_kernel, 0, sizeof(cl_mem), (void*) &ibuffer_d); status |= clSetKernelArg(ex1_unpack_east_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_unpack_east_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_unpack_east_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_unpack_east_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_unpack_east_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_unpack_east: "); size_t local[3] = { ex1iblock[0], ex1iblock[1], ex1iblock[2] }; size_t global[3] = { ex1iblock[0] * ex1igrid[0], ex1iblock[1] * ex1igrid[1], ex1iblock[2] * ex1igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_unpack_east_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); } /** * Send south/east. */ } else if (iex == 2) { if (south != -1) { timer_start(10); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_pack_south_kernel, 0, sizeof(cl_mem), (void*) &jbuffer_d); status |= clSetKernelArg(ex1_pack_south_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_pack_south_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_pack_south_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_pack_south_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_pack_south_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_pack_south: "); size_t local[3] = { ex1jblock[0], ex1jblock[1], ex1jblock[2] }; size_t global[3] = { ex1jblock[0] * ex1jgrid[0], ex1jblock[1] * ex1jgrid[1], ex1jblock[2] * ex1jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_pack_south_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, jbuffer_d, CL_TRUE, 0, kblock * (jend - jst + 1) * 5 * sizeof(double), jbuf, 0, NULL, NULL); // Send south. MPI_Send(jbuf, kblock * (jend - jst + 1) * 5, MPI_DOUBLE, south, from_n, MPI_COMM_WORLD); } if (east != -1) { timer_start(10); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_pack_east_kernel, 0, sizeof(cl_mem), (void*) &ibuffer_d); status |= clSetKernelArg(ex1_pack_east_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_pack_east_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_pack_east_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_pack_east_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_pack_east_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_pack_east: "); size_t local[3] = { ex1iblock[0], ex1iblock[1], ex1iblock[2] }; size_t global[3] = { ex1iblock[0] * ex1igrid[0], ex1iblock[1] * ex1igrid[1], ex1iblock[2] * ex1igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_pack_east_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, ibuffer_d, CL_TRUE, 0, kblock * (iend - ist + 1) * 5 * sizeof(double), ibuf, 0, NULL, NULL); // Send east. MPI_Send(ibuf, kblock * (iend - ist + 1) * 5, MPI_DOUBLE, east, from_w, MPI_COMM_WORLD); } /** * Send north/west. */ } else { if (north != -1) { timer_start(10); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_pack_north_kernel, 0, sizeof(cl_mem), (void*) &jbuffer_d); status |= clSetKernelArg(ex1_pack_north_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_pack_north_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_pack_north_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_pack_north_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_pack_north_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_pack_north: "); size_t local[3] = { ex1jblock[0], ex1jblock[1], ex1jblock[2] }; size_t global[3] = { ex1jblock[0] * ex1jgrid[0], ex1jblock[1] * ex1jgrid[1], ex1jblock[2] * ex1jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_pack_north_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, jbuffer_d, CL_TRUE, 0, kblock * (jend - jst + 1) * 5 * sizeof(double), jbuf, 0, NULL, NULL); // Send north. MPI_Send(jbuf, kblock * (jend - jst + 1) * 5, MPI_DOUBLE, north, from_s, MPI_COMM_WORLD); } if (west != -1) { timer_start(10); // Set the arguments to the kernel. 
status = clSetKernelArg(ex1_pack_west_kernel, 0, sizeof(cl_mem), (void*) &ibuffer_d); status |= clSetKernelArg(ex1_pack_west_kernel, 1, sizeof(cl_mem), (void*) &rsd_d); status |= clSetKernelArg(ex1_pack_west_kernel, 2, sizeof(int), (void*) &k); status |= clSetKernelArg(ex1_pack_west_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(ex1_pack_west_kernel, 4, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(ex1_pack_west_kernel, 5, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex1_pack_west: "); size_t local[3] = { ex1iblock[0], ex1iblock[1], ex1iblock[2] }; size_t global[3] = { ex1iblock[0] * ex1igrid[0], ex1iblock[1] * ex1igrid[1], ex1iblock[2] * ex1igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex1_pack_west_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(10); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, ibuffer_d, CL_TRUE, 0, kblock * (iend - ist + 1) * 5 * sizeof(double), ibuf, 0, NULL, NULL); // Send west. MPI_Send(ibuf, kblock * (iend - ist + 1) * 5, MPI_DOUBLE, west, from_e, MPI_COMM_WORLD); } } } exchange_3.c0000644000175600017620000001350211553632706011422 0ustar sjpsjp// C port of NPB3.2 // subroutine exchange_3 #include "mpinpb.h" #include "applu.h" #define g(k,j,i,m) (g[(((k) * (isiz2 + 4) + (j)) * (isiz1 + 4) + (i)) * 5 + (m)]) /** * Compute the right hand side based on exact solution. */ void exchange_3(double* g, int iex) { /** * Local variables. */ int i, j, k; int ipos1, ipos2; MPI_Request mid; MPI_Status status; /** * Communicate in the south and north directions. */ if (iex == 0) { if (north != -1) { MPI_Irecv(buf1, 10*ny*nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_n, MPI_COMM_WORLD, &mid); } /** * Send south. 
*/ if (south != -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; buf(ipos1,0) = g(k,j,nx,0); buf(ipos1,1) = g(k,j,nx,1); buf(ipos1,2) = g(k,j,nx,2); buf(ipos1,3) = g(k,j,nx,3); buf(ipos1,4) = g(k,j,nx,4); buf(ipos2,0) = g(k,j,nx+1,0); buf(ipos2,1) = g(k,j,nx+1,1); buf(ipos2,2) = g(k,j,nx+1,2); buf(ipos2,3) = g(k,j,nx+1,3); buf(ipos2,4) = g(k,j,nx+1,4); } } MPI_Send(buf, 10*ny*nz, MPI_DOUBLE, south, from_n, MPI_COMM_WORLD); } /** * Receive from north. */ if (north != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; g(k,j,0,0) = buf1(ipos1,0); g(k,j,0,1) = buf1(ipos1,1); g(k,j,0,2) = buf1(ipos1,2); g(k,j,0,3) = buf1(ipos1,3); g(k,j,0,4) = buf1(ipos1,4); g(k,j,1,0) = buf1(ipos2,0); g(k,j,1,1) = buf1(ipos2,1); g(k,j,1,2) = buf1(ipos2,2); g(k,j,1,3) = buf1(ipos2,3); g(k,j,1,4) = buf1(ipos2,4); } } } if (south != -1) { MPI_Irecv(buf1, 10*ny*nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &mid); } /** * Send north. */ if (north != -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; buf(ipos1,0) = g(k,j,3,0); buf(ipos1,1) = g(k,j,3,1); buf(ipos1,2) = g(k,j,3,2); buf(ipos1,3) = g(k,j,3,3); buf(ipos1,4) = g(k,j,3,4); buf(ipos2,0) = g(k,j,2,0); buf(ipos2,1) = g(k,j,2,1); buf(ipos2,2) = g(k,j,2,2); buf(ipos2,3) = g(k,j,2,3); buf(ipos2,4) = g(k,j,2,4); } } MPI_Send(buf, 10*ny*nz, MPI_DOUBLE, north, from_s, MPI_COMM_WORLD); } /** * Receive from south. 
*/ if (south != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++){ for (j = 2; j <= ny + 1; j++){ ipos1 = k*ny + j - 2; ipos2 = ipos1 + ny*nz; g(k,j,nx+3,0) = buf1(ipos1,0); g(k,j,nx+3,1) = buf1(ipos1,1); g(k,j,nx+3,2) = buf1(ipos1,2); g(k,j,nx+3,3) = buf1(ipos1,3); g(k,j,nx+3,4) = buf1(ipos1,4); g(k,j,nx+2,0) = buf1(ipos2,0); g(k,j,nx+2,1) = buf1(ipos2,1); g(k,j,nx+2,2) = buf1(ipos2,2); g(k,j,nx+2,3) = buf1(ipos2,3); g(k,j,nx+2,4) = buf1(ipos2,4); } } } /** * Communicate in the east and west directions. */ } else { if (west != -1) { MPI_Irecv(buf1, 10*nx*nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_w, MPI_COMM_WORLD, &mid); } /** * Send east. */ if (east != -1) { for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++){ ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; buf(ipos1,0) = g(k,ny,i,0); buf(ipos1,1) = g(k,ny,i,1); buf(ipos1,2) = g(k,ny,i,2); buf(ipos1,3) = g(k,ny,i,3); buf(ipos1,4) = g(k,ny,i,4); buf(ipos2,0) = g(k,ny+1,i,0); buf(ipos2,1) = g(k,ny+1,i,1); buf(ipos2,2) = g(k,ny+1,i,2); buf(ipos2,3) = g(k,ny+1,i,3); buf(ipos2,4) = g(k,ny+1,i,4); } } MPI_Send(buf, 10*nx*nz, MPI_DOUBLE, east, from_w, MPI_COMM_WORLD); } /** * Receive from west. */ if (west != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++){ for (i = 2; i <= nx + 1; i++){ ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; g(k,0,i,0) = buf1(ipos1,0); g(k,0,i,1) = buf1(ipos1,1); g(k,0,i,2) = buf1(ipos1,2); g(k,0,i,3) = buf1(ipos1,3); g(k,0,i,4) = buf1(ipos1,4); g(k,1,i,0) = buf1(ipos2,0); g(k,1,i,1) = buf1(ipos2,1); g(k,1,i,2) = buf1(ipos2,2); g(k,1,i,3) = buf1(ipos2,3); g(k,1,i,4) = buf1(ipos2,4); } } } if (east != -1){ MPI_Irecv(buf1, 10*nx*nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &mid); } /** * Send west. 
*/ if (west != -1) { for (k = 0; k <= nz - 1; k++){ for (i = 2; i <= nx + 1; i++){ ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; buf(ipos1,0) = g(k,3,i,0); buf(ipos1,1) = g(k,3,i,1); buf(ipos1,2) = g(k,3,i,2); buf(ipos1,3) = g(k,3,i,3); buf(ipos1,4) = g(k,3,i,4); buf(ipos2,0) = g(k,2,i,0); buf(ipos2,1) = g(k,2,i,1); buf(ipos2,2) = g(k,2,i,2); buf(ipos2,3) = g(k,2,i,3); buf(ipos2,4) = g(k,2,i,4); } } MPI_Send(buf, 10*nx*nz, MPI_DOUBLE, west, from_e, MPI_COMM_WORLD); } /** * Receive from east. */ if (east != -1) { MPI_Wait(&mid, &status); for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++) { ipos1 = k*nx + i - 2; ipos2 = ipos1 + nx*nz; g(k,ny+3,i,0) = buf1(ipos1,0); g(k,ny+3,i,1) = buf1(ipos1,1); g(k,ny+3,i,2) = buf1(ipos1,2); g(k,ny+3,i,3) = buf1(ipos1,3); g(k,ny+3,i,4) = buf1(ipos1,4); g(k,ny+2,i,0) = buf1(ipos2,0); g(k,ny+2,i,1) = buf1(ipos2,1); g(k,ny+2,i,2) = buf1(ipos2,2); g(k,ny+2,i,3) = buf1(ipos2,3); g(k,ny+2,i,4) = buf1(ipos2,4); } } } } } exchange_3_cl.c0000644000175600017620000001777411553632706012117 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" /** * Carry out the boundary swap required by the rhs function. */ void exchange_3_cl(int iex) { MPI_Request mid; MPI_Status mpi_status; cl_int status; /** * Communicate in the south and north directions. */ if (iex == 0) { // Receive from north. if (north != -1) { MPI_Irecv(buf1, 10 * ny * nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_n, MPI_COMM_WORLD, &mid); } // Pack and send south. if (south != -1) { timer_start(11); status = clSetKernelArg(ex3_pack_south_kernel, 0, sizeof(cl_mem), (void*) &buf_d); status |= clSetKernelArg(ex3_pack_south_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_pack_south: "); // Pack. 
size_t local[3] = { ex3jblock[0], ex3jblock[1], ex3jblock[2] }; size_t global[3] = { ex3jblock[0] * ex3jgrid[0], ex3jblock[1] * ex3jgrid[1], ex3jblock[2] * ex3jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_pack_south_kernel, 3, NULL, global, local, 0, NULL, NULL); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, buf_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf, 0, NULL, NULL); timer_stop(11); // Send south. MPI_Send(buf, 10 * ny * nz, MPI_DOUBLE, south, from_n, MPI_COMM_WORLD); } // Unpack north. if (north != -1) { MPI_Wait(&mid, &mpi_status); timer_start(11); // Copy the buffer to the device. clEnqueueWriteBuffer(subQueue, buf1_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf1, 0, NULL, NULL); status = clSetKernelArg(ex3_unpack_north_kernel, 0, sizeof(cl_mem), (void*) &buf1_d); status |= clSetKernelArg(ex3_unpack_north_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_unpack_north: "); // Unpack. size_t local[3] = { ex3jblock[0], ex3jblock[1], ex3jblock[2] }; size_t global[3] = { ex3jblock[0] * ex3jgrid[0], ex3jblock[1] * ex3jgrid[1], ex3jblock[2] * ex3jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_unpack_north_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(11); } // Receive from south. if (south != -1) { MPI_Irecv(buf1, 10 * ny * nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &mid); } // Pack and send north. if (north != -1) { timer_start(11); status = clSetKernelArg(ex3_pack_north_kernel, 0, sizeof(cl_mem), (void*) &buf_d); status |= clSetKernelArg(ex3_pack_north_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_pack_north: "); // Pack. 
size_t local[3] = { ex3jblock[0], ex3jblock[1], ex3jblock[2] }; size_t global[3] = { ex3jblock[0] * ex3jgrid[0], ex3jblock[1] * ex3jgrid[1], ex3jblock[2] * ex3jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_pack_north_kernel, 3, NULL, global, local, 0, NULL, NULL); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, buf_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf, 0, NULL, NULL); timer_stop(11); // Send north. MPI_Send(buf, 10 * ny * nz, MPI_DOUBLE, north, from_s, MPI_COMM_WORLD); } // Unpack south. if (south != -1) { MPI_Wait(&mid, &mpi_status); timer_start(11); // Copy the buffer to the device. clEnqueueWriteBuffer(subQueue, buf1_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf1, 0, NULL, NULL); status = clSetKernelArg(ex3_unpack_south_kernel, 0, sizeof(cl_mem), (void*) &buf1_d); status |= clSetKernelArg(ex3_unpack_south_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_unpack_south: "); // Unpack. size_t local[3] = { ex3jblock[0], ex3jblock[1], ex3jblock[2] }; size_t global[3] = { ex3jblock[0] * ex3jgrid[0], ex3jblock[1] * ex3jgrid[1], ex3jblock[2] * ex3jgrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_unpack_south_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(11); } /** * Communicate in the east and west directions. */ } else { // Receive from west. if (west != -1) { MPI_Irecv(buf1, 10 * nx * nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_w, MPI_COMM_WORLD, &mid); } // Pack and send east. if (east != -1) { timer_start(11); status = clSetKernelArg(ex3_pack_east_kernel, 0, sizeof(cl_mem), (void*) &buf_d); status |= clSetKernelArg(ex3_pack_east_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_pack_east: "); // Pack. 
size_t local[3] = { ex3iblock[0], ex3iblock[1], ex3iblock[2] }; size_t global[3] = { ex3iblock[0] * ex3igrid[0], ex3iblock[1] * ex3igrid[1], ex3iblock[2] * ex3igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_pack_east_kernel, 3, NULL, global, local, 0, NULL, NULL); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, buf_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf, 0, NULL, NULL); timer_stop(11); // Send east. MPI_Send(buf, 10 * nx * nz, MPI_DOUBLE, east, from_w, MPI_COMM_WORLD); } // Unpack west. if (west != -1) { MPI_Wait(&mid, &mpi_status); timer_start(11); // Copy the buffer to the device. clEnqueueWriteBuffer(subQueue, buf1_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf1, 0, NULL, NULL); status = clSetKernelArg(ex3_unpack_west_kernel, 0, sizeof(cl_mem), (void*) &buf1_d); status |= clSetKernelArg(ex3_unpack_west_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_unpack_west: "); // Unpack. size_t local[3] = { ex3iblock[0], ex3iblock[1], ex3iblock[2] }; size_t global[3] = { ex3iblock[0] * ex3igrid[0], ex3iblock[1] * ex3igrid[1], ex3iblock[2] * ex3igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_unpack_west_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(11); } // Receive from east. if (east != -1){ MPI_Irecv(buf1, 10 * nx * nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &mid); } // Pack and send west. if (west != -1) { timer_start(11); status = clSetKernelArg(ex3_pack_west_kernel, 0, sizeof(cl_mem), (void*) &buf_d); status |= clSetKernelArg(ex3_pack_west_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_pack_west: "); // Pack. 
size_t local[3] = { ex3iblock[0], ex3iblock[1], ex3iblock[2] }; size_t global[3] = { ex3iblock[0] * ex3igrid[0], ex3iblock[1] * ex3igrid[1], ex3iblock[2] * ex3igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_pack_west_kernel, 3, NULL, global, local, 0, NULL, NULL); // Retrieve buffer from the device. clEnqueueReadBuffer(subQueue, buf_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf, 0, NULL, NULL); timer_stop(11); // Send west. MPI_Send(buf, 10 * nx * nz, MPI_DOUBLE, west, from_e, MPI_COMM_WORLD); } // Unpack east. if (east != -1) { MPI_Wait(&mid, &mpi_status); timer_start(11); // Copy the buffer to the device. clEnqueueWriteBuffer(subQueue, buf1_d, CL_TRUE, 0, 10 * isiz3 * isiz2 * sizeof(double), buf1, 0, NULL, NULL); status = clSetKernelArg(ex3_unpack_east_kernel, 0, sizeof(cl_mem), (void*) &buf1_d); status |= clSetKernelArg(ex3_unpack_east_kernel, 1, sizeof(cl_mem), (void*) &u_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for ex3_unpack_east: "); // Unpack. size_t local[3] = { ex3iblock[0], ex3iblock[1], ex3iblock[2] }; size_t global[3] = { ex3iblock[0] * ex3igrid[0], ex3iblock[1] * ex3igrid[1], ex3iblock[2] * ex3igrid[2] }; status = clEnqueueNDRangeKernel(subQueue, ex3_unpack_east_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(11); } } } exchange_4.c0000644000175600017620000000255311553632706011427 0ustar sjpsjp//subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1) #include "mpinpb.h" #include "applu.h" void exchange_4(double** g, double** h, int ibeg, int ifin1, int jbeg, int jfin1){ /** * Local variables. */ int i, j; int ny2; double dum[1024]; MPI_Request msgid1, msgid3; MPI_Status status; ny2 = ny + 2; /** * Communicate in the east and west directions. */ // Receive from east. if (jfin1 == ny) { MPI_Irecv(dum, 2*nx, MPI_DOUBLE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &msgid3); MPI_Wait(&msgid3, &status); for (i = 1; i <= nx; i++) { g[ny+1][i] = dum[i-1]; h[ny+1][i] = dum[i+nx-1]; } } // Send west. 
if (jbeg == 1) { for (i = 1; i <= nx; i++) { dum[i-1] = g[1][i]; dum[i+nx-1] = h[1][i]; } MPI_Send(dum, 2*nx, MPI_DOUBLE, west, from_e, MPI_COMM_WORLD); } /** * Communicate in the south and north directions. */ // Receive from south. if (ifin1 == nx) { MPI_Irecv(dum, 2*ny2, MPI_DOUBLE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &msgid1); MPI_Wait(&msgid1, &status); for (j = 0; j <= ny + 1; j++){ g[j][nx+1] = dum[j+1-1]; h[j][nx+1] = dum[j+ny2+1-1]; } } // Send north. if (ibeg == 1) { for (j = 0; j <= ny+1; j++) { dum[j+1-1] = g[j][1]; dum[j+ny2+1-1] = h[j][1]; } MPI_Send(dum, 2*ny2, MPI_DOUBLE, north, from_s, MPI_COMM_WORLD); } } exchange_5.c0000644000175600017620000000354311553632706011430 0ustar sjpsjp//subroutine exchange_5(g,ibeg,ifin1) /** * compute the right hand side based on exact solution */ /** implicit none include 'mpinpb.h' include 'applu.incl' */ #include "mpinpb.h" #include "applu.h" void exchange_5(double** g, int ibeg, int ifin1){ /** * input parameters */ /* double precision g(0:isiz2+1,0:isiz3+1) integer ibeg, ifin1 */ /** * local variables */ /** integer k double precision dum(1024) integer msgid1 integer status(MPI_status_SIZE) integer IERROR */ int k; double dum[1024]; MPI_Request msgid1; MPI_Status status; /** * communicate in the south and north directions */ /** * receive from south */ /** if (ifin1.eq.nx) then call MPI_IRECV( dum, > nz, > dp_type, > MPI_ANY_SOURCE, > from_s, > MPI_COMM_WORLD, > msgid1, > IERROR ) call MPI_WAIT( msgid1, status, IERROR ) do k = 1,nz g(nx+1,k) = dum(k) end do end if */ if( ifin1 == nx){ MPI_Irecv(dum, nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_s, MPI_COMM_WORLD, &msgid1); MPI_Wait(&msgid1, &status); for(k = 1; k <= nz; k++){ g[k][nx+1] =dum[k-1]; } } /** * send north */ /** if (ibeg.eq.1) then do k = 1,nz dum(k) = g(1,k) end do call MPI_SEND( dum, > nz, > dp_type, > north, > from_s, > MPI_COMM_WORLD, > IERROR ) end if */ if(ibeg == 1){ for(k = 1; k <= nz; k++){ dum[k-1] = g[k][1]; } MPI_Send( dum, nz, MPI_DOUBLE, 
north, from_s, MPI_COMM_WORLD); } // return // end } exchange_6.c0000644000175600017620000000351611553632706011431 0ustar sjpsjp//subroutine exchange_6(g,jbeg,jfin1) /** * compute the right hand side based on exact solution */ // implicit none // include 'mpinpb.h' // include 'applu.incl' #include "mpinpb.h" #include "applu.h" void exchange_6(double** g, int jbeg, int jfin1){ /** * input parameters */ // double precision g(0:isiz2+1,0:isiz3+1) // integer jbeg, jfin1 /** * local parameters */ /** integer k double precision dum(1024) integer msgid3 integer status(MPI_status_SIZE) integer IERROR */ int k; double dum[1024]; MPI_Request msgid3; MPI_Status status; /** * communicate in the east and west directions */ /** * receive from east */ /** if (jfin1.eq.ny) then call MPI_IRECV( dum, > nz, > dp_type, > MPI_ANY_SOURCE, > from_e, > MPI_COMM_WORLD, > msgid3, > IERROR ) call MPI_WAIT( msgid3, status, IERROR ) do k = 1,nz g(ny+1,k) = dum(k) end do end if */ if( jfin1 == ny){ MPI_Irecv(dum, nz, MPI_DOUBLE, MPI_ANY_SOURCE, from_e, MPI_COMM_WORLD, &msgid3); MPI_Wait(&msgid3, &status); for(k = 1; k <= nz; k++){ g[k][ny+1] =dum[k-1]; } } /* * send west */ /** if (jbeg.eq.1) then do k = 1,nz dum(k) = g(1,k) end do call MPI_SEND( dum, > nz, > dp_type, > west, > from_e, > MPI_COMM_WORLD, > IERROR ) end if */ if(jbeg == 1){ for(k = 1; k <= nz; k++){ dum[k-1] = g[k][1]; } MPI_Send( dum, nz, MPI_DOUBLE, west, from_e, MPI_COMM_WORLD); } // return // end } init_comm.c0000644000175600017620000000133011553632706011370 0ustar sjpsjp// C port of NPB3.2 // subroutine init-comm #include "applu.h" #include "mpinpb.h" /** * initialize MPI and establish rank and size * * This is a module in the MPI implementation of LUSSOR * pseudo application from the NAS Parallel Benchmarks. */ void init_comm(int argc, char** argv) { // Initialize MPI communication. MPI_Init(&argc, &argv); // Establish the global rank of this process. 
MPI_Comm_rank(MPI_COMM_WORLD, &id); // Establish the size of the global group. MPI_Comm_size(MPI_COMM_WORLD, &num); /** * num - number of nodes (processors) * nodedim - a function such that it computes the exponent where num = 2 ^ nodedim * i.e. ndim is the square-root of num. */ ndim = nodedim(num); } l2norm_cl.c0000644000175600017620000000243111553632707011305 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" #include /** * Carry out global reduction. */ void l2norm_cl(int nz0, double* sum_h, cl_mem rsd, cl_mem sum_d) { cl_int status; // Set the arguments to the kernel. status = clSetKernelArg(l2norm_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(l2norm_kernel, 1, sizeof(cl_mem), (void*) &sum_d); status |= clSetKernelArg(l2norm_kernel, 2, sizeof(cl_int), (void*) &nz0); wclCheckError(status, CL_SUCCESS, " Could not set arguments to l2norm kernel: "); // Call the kernel. // TODO: Implement a better global reduction! size_t local = 1; size_t global = 5; timer_start(7); status = clEnqueueNDRangeKernel(subQueue, l2norm_kernel, 1, NULL, &global, &local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch l2norm kernel: "); // Get the results back. clEnqueueReadBuffer(subQueue, sum_d, CL_TRUE, 0, 5 * sizeof(double), sum_h, 0, NULL, NULL); // Compute the global sum of individual contributions to dot product. MPI_Allreduce(MPI_IN_PLACE, sum_h, 5, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); int m; for (m = 0; m < 5; m++) { sum_h[m] = sqrt ( sum_h[m] / ( (nx0-2)*(ny0-2)*(nz0-2) ) ); } timer_stop(7); } main.c0000644000175600017620000001226611564202472010343 0ustar sjpsjp/**** * * * This benchmark is a modification of the LU benchmark belonging * * to the NAS Parallel Benchmark 3.2 suite. * * * * Permission to use, copy, distribute and modify this software * * for any purpose with or without fee is hereby granted. 
We * * request, however, that all derived work reference the NAS * * Parallel Benchmarks 3.2. This software is provided "as is" * * without express or implied warranty. * * * * Information on NPB 3.2, including the technical report, the * * original specifications, source code, results and information * * on how to submit new results, is available at: * * * * http://www.nas.nasa.gov/Software/NPB/ * * * * *** * Original Authors: * * S. Weeratunga * * V. Venkatakrishnan * * E. Barszcz * * M. Yarrow * * * * *** * *** * * * OpenCL Port: * * S.J. Pennycook * * O.J Perks * * * * **/ #include "applu.h" #include "applu_cl.h" #include "mpinpb.h" #include "globals.h" #include "util.h" #include "timers.h" #include #include #include /** * Main method. */ int main(int argc, char* argv[]) { // Initialize communications. init_comm(argc, argv); timer_init(12); // Parse options. parse_options(argc, argv); // Allocate memory. allocate_buffers(); if (id == 0) { printf("===========================================\n"); } // Read input data. read_input(); // Set up processor grid. proc_grid(); // Determine the neighbors. neighbors(); // Set up sub-domain sizes - understand more later for per processor work. 
subdomain(); if (id == 0) { printf(" k-blocking = %d\n", kblock); if (opt_device == APPLU_DEVICE_CPU) { printf(" Device = CPU\n"); } else if (opt_device == APPLU_DEVICE_GPU) { printf(" Device = GPU\n"); } if (opt_layout == APPLU_LAYOUT_SOA) { printf(" Memory Layout = Struct of Arrays\n"); } else if (opt_layout == APPLU_LAYOUT_AOS) { printf(" Memory Layout = Array of Structs\n"); } if (opt_kernels == APPLU_KERNELS_SCALAR) { printf(" Kernels = Scalar\n"); } else if (opt_kernels == APPLU_KERNELS_VECTOR) { printf(" Kernels = Vector\n"); } if (opt_blocking == APPLU_BLOCKING_OLD) { printf(" Blocking Policy = Original\n"); } else if (opt_blocking == APPLU_BLOCKING_NEW) { printf(" Blocking Policy = New\n"); } if (opt_distribution == APPLU_DISTRIBUTION_FINE) { printf(" Work-item Distribution = Fine\n"); } else if (opt_distribution == APPLU_DISTRIBUTION_COARSE) { printf(" Work-item Distribution = Coarse\n"); } if (opt_fission == APPLU_FISSION_OFF) { printf(" Fission = Off\n"); } else if (opt_fission == APPLU_FISSION_ON) { printf(" Fission = On\n"); } printf("===========================================\n"); } // Set up coefficients. setcoeff(); // Set the masks required for comm. sethyper(); // Set the boundary values for dependent variables. setbv(); // Set the initial values for dependent variables. setiv(); // Compute the forcing term based on prescribed exact solution. erhs(); // Perform one SSOR iteration to touch all data and program pages. ssor_cl(1); // Reset the boundary and initial values. setbv(); setiv(); // Perform the SSOR iterations - timed. ssor_cl(itmax); // Compute the solution error. error(); // Compute the surface integral. pintgr(); // Verification test. if (id == 0) { verified = verify( rsdnm, errnm, frc, class ); #ifdef TIMING print_timers(); #else printf("%s:\t%f\t%f\n", "SSOR :", cputime, walltime); #endif } // Free memory for our arrays. 
free_buffers(); timer_finalize(); MPI_Finalize(); return 0; } Makefile0000644000175600017620000000326211753220710010702 0ustar sjpsjp# Default to single device. NDEVICES ?= 1 # Default to class S. CLASS ?= S # Default to k-blocking of 1, with the GPU solution blocking on 1 internally. KBLOCK ?= 1 DEBUG ?= 0 # Define compilers. CC = mpicc # Normal CFLAGS CFLAGS = -O5 -native -xprefetch -xunroll=8 -xipo -xvector -I ./headers -I $(AMDAPPSDKROOT)/include #CFLAGS = -O2 -funroll-loops -msse3 -Wall -I ./headers -I $(AMDAPPSDKROOT)/include LDFLAGS = -L $(AMDAPPSDKROOT)/lib/x86_64/ -L /home/saw/lib/ LFLAGS = -lOpenCL -lWCL # Check for DEBUG ifeq ($(DEBUG),1) CFLAGS = -O0 -g -I ./headers -I $(AMDAPPSDKROOT)/include endif # Check for TIMING ifeq ($(TIMING),1) CFLAGS := $(CFLAGS) -DTIMING endif # Define executable name. ifeq ($(TIMING),1) EXE = ./bin/applu.$(CLASS).$(NDEVICES).$(KBLOCK).TIMING else EXE = ./bin/applu.$(CLASS).$(NDEVICES).$(KBLOCK) endif OBJS = main.o \ bcast_inputs.o \ erhs.o \ error.o \ exact.o \ exchange_1_cl.o \ exchange_3.o \ exchange_3_cl.o \ exchange_4.o \ exchange_5.o \ exchange_6.o \ init_comm.o \ neighbors.o \ nodedim.o \ pintgr.o \ proc_grid.o \ read_input.o \ rhs.o \ setbv.o \ setcoeff.o \ sethyper.o \ setiv.o \ subdomain.o \ verify.o \ util.o \ timers.o \ ssor_cl.o \ blts_cl.o \ buts_cl.o \ l2norm_cl.o \ pre_cl.o \ post_cl.o \ rhs_cl.o \ util_cl.o \ rearrangement_cl.o all: clean setparams applu applu: $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $(EXE) $(OBJS) -lm $(LFLAGS) .c.o: $*.c applu.h $(CC) $(CFLAGS) -c $*.c setparams: $(CC) $(CFLAGS) -o setparams setparams.c -lm ./setparams $(NDEVICES) $(CLASS) $(KBLOCK) mv size.h ./headers/size.h clean: rm -f *.o core gmon.out setparams *~ rm -f size.h ./headers/size.h neighbors.c0000644000175600017620000000200111553632707011367 0ustar sjpsjp // subroutine neighbors () // implicit none // include 'applu.incl' #include "applu.h" void neighbors(){ /** * figure out the neighbors and their wrap numbers for each 
processor */ south = -1; east = -1; north = -1; west = -1; /** if (row.gt.1) then north = id -1 else north = -1 end if */ if( row > 1 ){ north = id - 1; }else{ north = -1; } /** if (row.lt.xdim) then south = id + 1 else south = -1 end if */ if( row < xdim){ south = id + 1; }else{ south =-1; } /** if (col.gt.1) then west = id- xdim else west = -1 end if */ if( col > 1){ west = id - xdim; }else{ west = -1; } /** if (col.lt.ydim) then east = id + xdim else east = -1 end if */ if( col < ydim){ east = id + xdim; }else{ east =-1; } // return // end } nodedim.c0000644000175600017620000000074711553632707011045 0ustar sjpsjp#include // integer function nodedim(num) int nodedim(int num){ /** * compute the exponent where num = 2**nodedim * NOTE: assumes a power-of-two number of nodes */ // implicit none /** * input parameters */ // integer num /** * local variables */ //double precision fnum double fnum; //fnum = dble(num) fnum = (double) num; //nodedim = log(fnum)/log(2.0d+0) + 0.00001 return log(fnum)/log((double)2.0) + 0.00001; // return // end } pintgr.c0000644000175600017620000001414111553632707010722 0ustar sjpsjp#include "applu.h" #include "mpinpb.h" #include #include /** * Set up the sub-domains for integration in each processor. */ void pintgr() { /** * Local variables. */ int i, j, k; int ibeg, ifin, ifin1; int jbeg, jfin, jfin1; int iglob, iglob1, iglob2; int jglob, jglob1, jglob2; int ind1, ind2; double **phi1, **phi2; double frc1, frc2, frc3; double dummy; // Allocate for phi1 and phi2. phi1 = malloc( (isiz3 + 2) * sizeof(double*) ); phi2 = malloc( (isiz3 + 2) * sizeof(double*) ); // Allocate arrays. for (k = 0; k < (isiz3 + 2); k++) { phi1[k] = malloc( (isiz2 + 2) * sizeof(double) ); phi2[k] = malloc( (isiz2 + 2) * sizeof(double) ); } // TODO: Chances are a lot of these numbers are way off or something. 
ibeg = nx + 1; ifin = 0; iglob1 = ipt + 1; iglob2 = ipt + nx; if (iglob1 >= ii1 && iglob2 < ii2 + nx) { ibeg = 1; } if (iglob1 > ii1 - nx && iglob2 <= ii2) { ifin = nx; } if (ii1 >= iglob1 && ii1 <= iglob2) { ibeg = ii1 - ipt; } if (ii2 >= iglob1 && ii2 <= iglob2) { ifin = ii2 - ipt; } jbeg = ny + 1; jfin = 0; jglob1 = jpt + 1; jglob2 = jpt + ny; if (jglob1 >= ji1 && jglob2 < ji2 + ny) { jbeg = 1; } if (jglob1 > ji1 - ny && jglob2 <= ji2) { jfin = ny; } if (ji1 >= jglob1 && ji1 <= jglob2) { jbeg = ji1 - jpt; } if (ji2 >= jglob1 && ji2 <= jglob2) { jfin = ji2 - jpt; } ifin1 = ifin; jfin1 = jfin; if (ipt + ifin1 == ii2) { ifin1 = ifin - 1; } if (jpt + jfin1 == ji2) { jfin1 = jfin - 1; } // Initialize. for (i = 0; i <= isiz2 + 1; i++) { for (k = 0; k <= isiz3 + 1; k++) { phi1[k][i] = 0; phi2[k][i] = 0; } } for (j = jbeg + 1; j <= jfin + 1; j++) { jglob = jpt + j; for (i = ibeg + 1; i <= ifin + 1; i++) { iglob = ipt + i; k = ki1 - 1; phi1[j-1][i-1] = c2 * ( u(k,j,i,4) - 0.50e+00 * ( u(k,j,i,1) * u(k,j,i,1) + u(k,j,i,2) * u(k,j,i,2) + u(k,j,i,3) * u(k,j,i,3) ) / u(k,j,i,0) ); k = ki2 - 1; phi2[j-1][i-1] = c2 * ( u(k,j,i,4) - 0.50e+00 * ( u(k,j,i,1) * u(k,j,i,1) + u(k,j,i,2) * u(k,j,i,2) + u(k,j,i,3) * u(k,j,i,3) ) / u(k,j,i,0) ); } } // Communicate in i and j directions. exchange_4(phi1, phi2, ibeg, ifin1, jbeg, jfin1); frc1 = 0.0e+00; for (j = jbeg; j <= jfin1; j++) { for (i = ibeg; i <= ifin1; i++) { frc1 = frc1 + ( phi1[j][i] + phi1[j][i+1] + phi1[j+1][i] + phi1[j+1][i+1] + phi2[j][i] + phi2[j][i+1] + phi2[j+1][i] + phi2[j+1][i+1] ); } } // Compute the global sum of individual contributions to frc1. dummy = frc1; MPI_Allreduce(&dummy, &frc1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); frc1 = dxi * deta * frc1; // Initialize. 
for (i = 0; i <= isiz2 + 1; i++) { for (k = 0; k <= isiz3 + 1; k++) { phi1[k][i] = 0; phi2[k][i] = 0; } } jglob = jpt + jbeg; ind1 = 0; if (jglob == ji1) { ind1 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (i = ibeg + 1; i <= ifin + 1; i++) { iglob = ipt + i; phi1[k+1][i-1] = c2 * ( u(k,jbeg+1,i,4) - 0.50e+00 * ( u(k,jbeg+1,i,1) * u(k,jbeg+1,i,1) + u(k,jbeg+1,i,2) * u(k,jbeg+1,i,2) + u(k,jbeg+1,i,3) * u(k,jbeg+1,i,3) ) / u(k,jbeg+1,i,0) ); } } } jglob = jpt + jfin; ind2 = 0; if (jglob == ji2) { ind2 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (i = ibeg + 1; i <= ifin + 1; i++) { iglob = ipt + i; phi2[k+1][i-1] = c2 * ( u(k,jfin+1,i,4) - 0.50e+00 * ( u(k,jfin+1,i,1) * u(k,jfin+1,i,1) + u(k,jfin+1,i,2) * u(k,jfin+1,i,2) + u(k,jfin+1,i,3) * u(k,jfin+1,i,3) ) / u(k,jfin+1,i,0) ); } } } // Communicate in i direction. if (ind1 == 1) { exchange_5(phi1, ibeg, ifin1); } if (ind2 == 1) { exchange_5(phi2, ibeg, ifin1); } frc2 = 0.0e+00; for (k = ki1; k <= ki2 - 1; k++) { for (i = ibeg; i <= ifin1; i++) { frc2 = frc2 + ( phi1[k][i] + phi1[k][i+1] + phi1[k+1][i] + phi1[k+1][i+1] + phi2[k][i] + phi2[k][i+1] + phi2[k+1][i] + phi2[k+1][i+1] ); } } // Compute the global sum of individual contributions to frc2. dummy = frc2; MPI_Allreduce(&dummy, &frc2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); frc2 = dxi * dzeta * frc2; // Initialize. 
for (i = 0; i <= isiz2 + 1; i++) { for (k = 0; k <= isiz3 + 1; k++) { phi1[k][i] = 0; phi2[k][i] = 0; } } iglob = ipt + ibeg; ind1 = 0; if (iglob == ii1) { ind1 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (j = jbeg + 1; j <= jfin + 1; j++) { jglob = jpt + j; phi1[k+1][j-1] = c2 * ( u(k,j,ibeg+1,4) - 0.50e+00 * ( u(k,j,ibeg+1,1) * u(k,j,ibeg+1,1) + u(k,j,ibeg+1,2) * u(k,j,ibeg+1,2) + u(k,j,ibeg+1,3) * u(k,j,ibeg+1,3) ) / u(k,j,ibeg+1,0) ); } } } iglob = ipt + ifin; ind2 = 0; if (iglob == ii2) { ind2 = 1; for (k = ki1 - 1; k <= ki2 - 1; k++) { for (j = jbeg + 1; j <= jfin + 1; j++) { jglob = jpt + j; phi2[k+1][j-1] = c2 * ( u(k,j,ifin+1,4) - 0.50e+00 * ( u(k,j,ifin+1,1) * u(k,j,ifin+1,1) + u(k,j,ifin+1,2) * u(k,j,ifin+1,2) + u(k,j,ifin+1,3) * u(k,j,ifin+1,3) ) / u(k,j,ifin+1,0) ); } } } // Communicate in j direction. if (ind1 == 1) { exchange_6(phi1, jbeg, jfin1); } if (ind2 == 1) { exchange_6(phi2, jbeg, jfin1); } frc3 = 0.0e+00; for (k = ki1; k <= ki2 - 1; k++) { for (j = jbeg; j <= jfin1; j++) { frc3 = frc3 + ( phi1[k][j] + phi1[k][j+1] + phi1[k+1][j] + phi1[k+1][j+1] + phi2[k][j] + phi2[k][j+1] + phi2[k+1][j] + phi2[k+1][j+1] ); } } // Compute the global sum of individual contributions to frc3. dummy = frc3; MPI_Allreduce(&dummy, &frc3, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); frc3 = deta * dzeta * frc3; frc = 0.25e+00 * (frc1 + frc2 + frc3); if (id == 0) { printf(" Surface integral = %e.\n", frc); } // Free arrays. for (k = 0; k < (isiz3 + 2); k++) { free(phi1[k]); free(phi2[k]); } // Free phi1 and 2. free(phi1); free(phi2); } post_cl.c0000644000175600017620000000171511553632707011065 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" #include /** * Update the solution. */ void post_cl(double tmp, cl_mem u, cl_mem rsd) { cl_int status; // Set the arguments to the kernel. 
status = clSetKernelArg(post_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(post_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(post_kernel, 2, sizeof(cl_double), (void*) &tmp); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for post: "); // Call the kernel. size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; timer_start(6); status = clEnqueueNDRangeKernel(subQueue, post_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(6); wclCheckError(status, CL_SUCCESS, " Could not launch postprocessing kernel: "); } pre_cl.c0000644000175600017620000000145011553632707010662 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" #include /** * Carry out the SSOR iteration. */ void pre_cl(cl_mem rsd) { cl_int status; // Set the arguments to the kernel. status = clSetKernelArg(pre_kernel, 0, sizeof(cl_mem), (void*) &rsd); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for pre: "); // Call the kernel. 
size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; timer_start(1); status = clEnqueueNDRangeKernel(subQueue, pre_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(1); wclCheckError(status, CL_SUCCESS, " Could not launch preprocessing kernel: "); } proc_grid.c0000644000175600017620000000231311553632707011365 0ustar sjpsjp /** * subroutine proc_grid */ #include #include #include "applu.h" /** * implicit none * * include 'applu.incl' */ /** * local variables */ /** * set up a two-d grid for processors: column-major ordering of unknowns * NOTE: assumes a power-of-two number of processors */ /** * num - number of nodes(processors) * nodedim - a function such that it compute the exponent where num = 2**nodedim * i.e. ndim is the squar-root of num */ void proc_grid(){ // xdim = 2**(ndim/2) xdim = pow(2, (ndim/2)); // if (mod(ndim,2).eq.1) xdim = xdim + xdim if(ndim%2 == 1){ xdim = xdim + xdim; } // ydim = num/xdim ydim = num/xdim; // write( *, 200) ndim //200 format(' ndim ', i4) if (id == 0) { printf(" ndim %d\n", ndim); // write( *, 210) xdim //210 format(' xdim ', i4) printf(" xdim %d\n", xdim); // write( *, 201) ydim //201 format(' ydim ', i4) printf(" ydim %d\n", ydim); } // row = mod(id,xdim) + 1 row = (id%xdim) + 1; // col = id/xdim + 1 col = (id/xdim) + 1; //printf("Rank %d: Row = %d, Column = %d.\n", id, row, col); // return } read_input.c0000644000175600017620000000507411553632707011556 0ustar sjpsjp#include "mpinpb.h" #include "applu.h" #include /** * only root reads the input file * if input file does not exist, it uses defaults * ipr = 1 for detailed progress output * inorm = how often the norm is printed (once every inorm iterations) * itmax = number of pseudo time steps * dt = time step * omega 1 over-relaxation factor for SSOR * tolrsd = steady state residual tolerance levels * nx, ny, nz = number of grid points in x, y, z directions */ void 
read_input(){ int fstatus, nnodes; FILE *fp; char line[1024]; root = 0; if (id == root) { fp = fopen("inputlu.data2", "r"); if (fp != NULL) { printf(" Reading from input file inputlu.data\n"); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\t%d\n", &ipr, &inorm); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\n", &itmax); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%lf\n", &dt); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%lf\n", &omega); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%lf\t%lf\t%lf\t%lf\t%lf\n", &tolrsd[0], &tolrsd[1], &tolrsd[2], &tolrsd[3], &tolrsd[4]); fgets(line, 1000, fp); fgets(line, 1000, fp); fscanf(fp, "%d\t%d\t%d\n", &nx0, &ny0, &nz0); fclose(fp); } else { printf(" Using defaults.\n"); ipr = ipr_default; inorm = inorm_default; itmax = itmax_default; dt = dt_default; omega = omega_default; tolrsd[0] = tolrsd1_def; tolrsd[1] = tolrsd2_def; tolrsd[2] = tolrsd3_def; tolrsd[3] = tolrsd4_def; tolrsd[4] = tolrsd5_def; nx0 = isiz01; ny0 = isiz02; nz0 = isiz03; } MPI_Comm_size(MPI_COMM_WORLD, &nnodes); if (nnodes != nnodes_compiled) { printf(" Warning: program is running on %d processors but was compiled for %d\n", nnodes, nnodes_compiled); } if (nx0 < 4 || ny0 < 4 || nz0 < 4) { printf(" PROBLEM SIZE IS TOO SMALL - SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5\n"); MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER); } if (nx0 > isiz01 || ny0 > isiz02 || nz0 > isiz03) { printf(" PROBLEM SIZE IS TOO LARGE - NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY\n"); MPI_Abort(MPI_COMM_WORLD, MPI_ERR_OTHER); } printf(" NAS Parallel Benchmarks 3.2 -- LU Benchmark\n"); printf(" University of ÌÇÐÄTV OpenCL Port\n"); printf(" Size: %d %d %d\n", nx0, ny0, nz0); printf(" Iterations: %d\n", itmax); printf(" Number of processors: %d\n", nnodes); } bcast_inputs(); } rearrangement_cl.c0000644000175600017620000001444411553632707012735 0ustar sjpsjp#include "size.h" #include 
"applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" /** * Wrapper method for flat_to_hyperplane kernel. */ void flat_to_hyperplane(cl_mem flat, cl_mem hyperplane) { cl_int status; size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; // Transpose the flat array, storing the result in the rearrangement buffer. status = clSetKernelArg(flat_to_hyperplane_kernel, 0, sizeof(cl_mem), (void*) &flat); status |= clSetKernelArg(flat_to_hyperplane_kernel, 1, sizeof(cl_mem), (void*) &hyperplane); status |= clSetKernelArg(flat_to_hyperplane_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(flat_to_hyperplane_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(flat_to_hyperplane_kernel, 4, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for flat_to_hyperplane: "); status = clEnqueueNDRangeKernel(subQueue, flat_to_hyperplane_kernel, 3, NULL, global, local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch flat_to_hyperplane kernel: "); } /** * Wrapper method for hyperplane_to_flat kernel. */ void hyperplane_to_flat(cl_mem hyperplane, cl_mem flat) { cl_int status; size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; // Transpose the flat array, storing the result in the rearrangement buffer. 
status = clSetKernelArg(hyperplane_to_flat_kernel, 0, sizeof(cl_mem), (void*) &hyperplane); status |= clSetKernelArg(hyperplane_to_flat_kernel, 1, sizeof(cl_mem), (void*) &flat); status |= clSetKernelArg(hyperplane_to_flat_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(hyperplane_to_flat_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(hyperplane_to_flat_kernel, 4, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for hyperplane_to_flat: "); status = clEnqueueNDRangeKernel(subQueue, hyperplane_to_flat_kernel, 3, NULL, global, local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch hyperplane_to_flat kernel: "); } /** * Wrapper method for tiled_to_hyperplane kernel. */ void tiled_to_hyperplane(cl_mem tiled, cl_mem hyperplane) { cl_int status; size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; // Transpose the tiled array, storing the result in the rearrangement buffer. status = clSetKernelArg(tiled_to_hyperplane_kernel, 0, sizeof(cl_mem), (void*) &tiled); status |= clSetKernelArg(tiled_to_hyperplane_kernel, 1, sizeof(cl_mem), (void*) &hyperplane); status |= clSetKernelArg(tiled_to_hyperplane_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(tiled_to_hyperplane_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(tiled_to_hyperplane_kernel, 4, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for tiled_to_hyperplane: "); status = clEnqueueNDRangeKernel(subQueue, tiled_to_hyperplane_kernel, 3, NULL, global, local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch tiled_to_hyperplane kernel: "); } /** * Wrapper method for hyperplane_to_tiled kernel. 
*/ void hyperplane_to_tiled(cl_mem hyperplane, cl_mem tiled) { cl_int status; size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; // Transpose the tiled array, storing the result in the rearrangement buffer. status = clSetKernelArg(hyperplane_to_tiled_kernel, 0, sizeof(cl_mem), (void*) &hyperplane); status |= clSetKernelArg(hyperplane_to_tiled_kernel, 1, sizeof(cl_mem), (void*) &tiled); status |= clSetKernelArg(hyperplane_to_tiled_kernel, 2, sizeof(cl_mem), (void*) &wavefront_offsets_2d_d); status |= clSetKernelArg(hyperplane_to_tiled_kernel, 3, sizeof(cl_mem), (void*) &wavefront_offsets_3d_d); status |= clSetKernelArg(hyperplane_to_tiled_kernel, 4, sizeof(cl_mem), (void*) &thread_mapping_d); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for hyperplane_to_tiled: "); status = clEnqueueNDRangeKernel(subQueue, hyperplane_to_tiled_kernel, 3, NULL, global, local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch hyperplane_to_tiled kernel: "); } /** * Wrapper method for flat_to_tiled kernel. */ void flat_to_tiled(cl_mem flat, cl_mem tiled) { cl_int status; size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; // Transpose the flat array, storing the result in the rearrangement buffer. status = clSetKernelArg(flat_to_tiled_kernel, 0, sizeof(cl_mem), (void*) &flat); status |= clSetKernelArg(flat_to_tiled_kernel, 1, sizeof(cl_mem), (void*) &tiled); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for flat_to_tiled: "); status = clEnqueueNDRangeKernel(subQueue, flat_to_tiled_kernel, 3, NULL, global, local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch flat_to_tiled kernel: "); } /** * Wrapper method for tiled_to_flat kernel. 
*/ void tiled_to_flat(cl_mem tiled, cl_mem flat) { cl_int status; size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; // Transpose the flat array, storing the result in the rearrangement buffer. status = clSetKernelArg(tiled_to_flat_kernel, 0, sizeof(cl_mem), (void*) &tiled); status |= clSetKernelArg(tiled_to_flat_kernel, 1, sizeof(cl_mem), (void*) &flat); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for tiled_to_flat: "); status = clEnqueueNDRangeKernel(subQueue, tiled_to_flat_kernel, 3, NULL, global, local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Could not launch tiled_to_flat kernel: "); } rhs.c0000644000175600017620000003257111553632707010222 0ustar sjpsjp#include "applu.h" #include "util.h" /** * Compute the right hand sides. */ void rhs() { /** * Local variables. */ int i, j, k, m; int iex; int L1, L2; int ist1, iend1; int jst1, jend1; double q; double u21, u31, u41; double tmp; double u21i, u31i, u41i, u51i; double u21j, u31j, u41j, u51j; double u21k, u31k, u41k, u51k; double u21im1, u31im1, u41im1, u51im1; double u21jm1, u31jm1, u41jm1, u51jm1; double u21km1, u31km1, u41km1, u51km1; for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { for (i = 2; i <= nx + 1; i++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = -frct(k,j,i,m); } } } } // xi-direction flux differences. /** * iex = flag : iex = 0 north/south communication * : iex = 1 east/west communication */ iex = 0; // Communicate and receive/send two rows of data. // TODO: May need to pass this by reference... 
exchange_3(u, iex); L1 = 1; if (north == -1) { L1 = 2; } L2 = nx + 2; if (south == -1) { L2 = nx + 1; } for (k = 1; k <= nz - 2; k++) { for (j = jst; j<= jend; j++) { for (i = L1; i <= L2; i++) { flux(k,j,i,0) = u(k,j,i,1); u21 = u(k,j,i,1) / u(k,j,i,0); q = 0.50e+00 * ( u(k,j,i,1) * u(k,j,i,1) + u(k,j,i,2) * u(k,j,i,2) + u(k,j,i,3) * u(k,j,i,3) ) / u(k,j,i,0); flux(k,j,i,1) = u(k,j,i,1) * u21 + c2 * ( u(k,j,i,4) - q ); flux(k,j,i,2) = u(k,j,i,2) * u21; flux(k,j,i,3) = u(k,j,i,3) * u21; flux(k,j,i,4) = ( c1 * u(k,j,i,4) - c2 * q ) * u21; } } } for (k = 1; k <= nz - 2; k++) { for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = rsd(k,j,i,m) - tx2 * ( flux(k,j,i+1,m) - flux(k,j,i-1,m) ); } } L2 = nx + 2; if (south == -1) { L2 = nx + 1; } for (i = ist; i <= L2; i++) { tmp = 1.0e+00 / u(k,j,i,0); u21i = tmp * u(k,j,i,1); u31i = tmp * u(k,j,i,2); u41i = tmp * u(k,j,i,3); u51i = tmp * u(k,j,i,4); tmp = 1.0e+00 / u(k,j,i-1,0); u21im1 = tmp * u(k,j,i-1,1); u31im1 = tmp * u(k,j,i-1,2); u41im1 = tmp * u(k,j,i-1,3); u51im1 = tmp * u(k,j,i-1,4); flux(k,j,i,1) = ( 4.0e+00 / 3.0e+00 ) * tx3 * (u21i - u21im1); flux(k,j,i,2) = tx3 * ( u31i - u31im1 ); flux(k,j,i,3) = tx3 * ( u41i - u41im1 ); flux(k,j,i,4) = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tx3 * ( ( u21i * u21i + u31i * u31i + u41i * u41i ) - ( u21im1 * u21im1 + u31im1 * u31im1 + u41im1 * u41im1 ) ) + (1.0e+00/6.0e+00) * tx3 * ( u21i * u21i - u21im1 * u21im1 ) + c1 * c5 * tx3 * ( u51i - u51im1 ); } for (i = ist; i <= iend; i++) { rsd(k,j,i,0) = rsd(k,j,i,0) + dx1 * tx1 * ( u(k,j,i-1,0) - 2.0e+00 * u(k,j,i,0) + u(k,j,i+1,0) ); rsd(k,j,i,1) = rsd(k,j,i,1) + tx3 * c3 * c4 * ( flux(k,j,i+1,1) - flux(k,j,i,1) ) + dx2 * tx1 * ( u(k,j,i-1,1) - 2.0e+00 * u(k,j,i,1) + u(k,j,i+1,1) ); rsd(k,j,i,2) = rsd(k,j,i,2) + tx3 * c3 * c4 * ( flux(k,j,i+1,2) - flux(k,j,i,2) ) + dx3 * tx1 * ( u(k,j,i-1,2) - 2.0e+00 * u(k,j,i,2) + u(k,j,i+1,2) ); rsd(k,j,i,3) = rsd(k,j,i,3) + tx3 * c3 * c4 * ( 
flux(k,j,i+1,3) - flux(k,j,i,3) ) + dx4 * tx1 * ( u(k,j,i-1,3) - 2.0e+00 * u(k,j,i,3) + u(k,j,i+1,3) ); rsd(k,j,i,4) = rsd(k,j,i,4) + tx3 * c3 * c4 * ( flux(k,j,i+1,4) - flux(k,j,i,4) ) + dx5 * tx1 * ( u(k,j,i-1,4) - 2.0e+00 * u(k,j,i,4) + u(k,j,i+1,4) ); } /** * Fourth-order dissipation. */ if (north == -1) { for (m = 0; m < 5; m++) { rsd(k,j,3,m) = rsd(k,j,3,m) - dssp * ( + 5.0e+00 * u(k,j,3,m) - 4.0e+00 * u(k,j,4,m) + u(k,j,5,m) ); rsd(k,j,4,m) = rsd(k,j,4,m) - dssp * ( - 4.0e+00 * u(k,j,3,m) + 6.0e+00 * u(k,j,4,m) - 4.0e+00 * u(k,j,5,m) + u(k,j,6,m) ); } } ist1 = 2; iend1 = nx + 1; if (north == -1) { ist1 = 5; } if (south == -1) { iend1 = nx - 2; } for (i = ist1; i <= iend1; i++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = rsd(k,j,i,m) - dssp * ( u(k,j,i-2,m) - 4.0e+00 * u(k,j,i-1,m) + 6.0e+00 * u(k,j,i,m) - 4.0e+00 * u(k,j,i+1,m) + u(k,j,i+2,m) ); } } if (south == -1) { for (m = 0; m < 5; m++) { rsd(k,j,nx-1,m) = rsd(k,j,nx-1,m) - dssp * ( u(k,j,nx-3,m) - 4.0e+00 * u(k,j,nx-2,m) + 6.0e+00 * u(k,j,nx-1,m) - 4.0e+00 * u(k,j,nx,m) ); rsd(k,j,nx,m) = rsd(k,j,nx,m) - dssp * ( u(k,j,nx-2,m) - 4.0e+00 * u(k,j,nx-1,m) + 5.0e+00 * u(k,j,nx,m) ); } } } } /** * eta-direction flux differences. */ // iex = flag : iex = 0 north/south communication iex = 1; // Communicate and receive/send two rows of data. // TODO: May need to pass these by reference... 
exchange_3(u, iex); L1 = 1; if (west == -1) { L1 = 2; } L2 = ny + 2; if (east == -1) { L2 = ny + 1; } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = L1; j <= L2; j++) { flux(k,j,i,0) = u(k,j,i,2); u31 = u(k,j,i,2) / u(k,j,i,0); q = 0.50e+00 * ( u(k,j,i,1) * u(k,j,i,1) + u(k,j,i,2) * u(k,j,i,2) + u(k,j,i,3) * u(k,j,i,3) ) / u(k,j,i,0); flux(k,j,i,1) = u(k,j,i,1) * u31; flux(k,j,i,2) = u(k,j,i,2) * u31 + c2 * ( u(k,j,i,4) - q ); flux(k,j,i,3) = u(k,j,i,3) * u31; flux(k,j,i,4) = ( c1 * u(k,j,i,4) - c2 * q ) * u31; } } } for (k = 1; k <= nz - 2; k++) { for (i = ist; i <= iend; i++) { for (j = jst; j <= jend; j++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = rsd(k,j,i,m) - ty2 * ( flux(k,j+1,i,m) - flux(k,j-1,i,m) ); } } L2 = ny + 2; if (east == -1) { L2 = ny + 1; } for (j = jst; j <= L2; j++) { tmp = 1.0e+00 / u(k,j,i,0); u21j = tmp * u(k,j,i,1); u31j = tmp * u(k,j,i,2); u41j = tmp * u(k,j,i,3); u51j = tmp * u(k,j,i,4); tmp = 1.0e+00 / u(k,j-1,i,0); u21jm1 = tmp * u(k,j-1,i,1); u31jm1 = tmp * u(k,j-1,i,2); u41jm1 = tmp * u(k,j-1,i,3); u51jm1 = tmp * u(k,j-1,i,4); flux(k,j,i,1) = ty3 * ( u21j - u21jm1 ); flux(k,j,i,2) = (4.0e+00/3.0e+00) * ty3 * (u31j - u31jm1); flux(k,j,i,3) = ty3 * ( u41j - u41jm1 ); flux(k,j,i,4) = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * ty3 * ( ( u21j * u21j + u31j * u31j + u41j * u41j ) - ( u21jm1 * u21jm1 + u31jm1 * u31jm1 + u41jm1 * u41jm1 ) ) + (1.0e+00/6.0e+00) * ty3 * ( u31j * u31j - u31jm1 * u31jm1 ) + c1 * c5 * ty3 * ( u51j - u51jm1 ); } for (j = jst; j <= jend; j++) { rsd(k,j,i,0) = rsd(k,j,i,0) + dy1 * ty1 * ( u(k,j-1,i,0) - 2.0e+00 * u(k,j,i,0) + u(k,j+1,i,0) ); rsd(k,j,i,1) = rsd(k,j,i,1) + ty3 * c3 * c4 * ( flux(k,j+1,i,1) - flux(k,j,i,1) ) + dy2 * ty1 * ( u(k,j-1,i,1) - 2.0e+00 * u(k,j,i,1) + u(k,j+1,i,1) ); rsd(k,j,i,2) = rsd(k,j,i,2) + ty3 * c3 * c4 * (flux(k,j+1,i,2) - flux(k,j,i,2) ) + dy3 * ty1 * ( u(k,j-1,i,2) - 2.0e+00 * u(k,j,i,2) + u(k,j+1,i,2) ); rsd(k,j,i,3) = rsd(k,j,i,3) + ty3 * c3 * c4 * 
(flux(k,j+1,i,3) - flux(k,j,i,3) ) + dy4 * ty1 * ( u(k,j-1,i,3) - 2.0e+00 * u(k,j,i,3) + u(k,j+1,i,3) ); rsd(k,j,i,4) = rsd(k,j,i,4) + ty3 * c3 * c4 * (flux(k,j+1,i,4) - flux(k,j,i,4) ) + dy5 * ty1 * ( u(k,j-1,i,4) - 2.0e+00 * u(k,j,i,4) + u(k,j+1,i,4) ); } /** * Fourth-order dissipation. */ if (west == -1) { for (m = 0; m < 5; m++) { rsd(k,3,i,m) = rsd(k,3,i,m) - dssp * ( + 5.0e+00 * u(k,3,i,m) - 4.0e+00 * u(k,4,i,m) + u(k,5,i,m) ); rsd(k,4,i,m) = rsd(k,4,i,m) - dssp * ( - 4.0e+00 * u(k,3,i,m) + 6.0e+00 * u(k,4,i,m) - 4.0e+00 * u(k,5,i,m) + u(k,6,i,m) ); } } jst1 = 2; jend1 = ny + 1; if (west == -1) { jst1 = 5; } if (east == -1) { jend1 = ny - 2; } for (j = jst1; j <= jend1; j++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = rsd(k,j,i,m) - dssp * ( u(k,j-2,i,m) - 4.0e+00 * u(k,j-1,i,m) + 6.0e+00 * u(k,j,i,m) - 4.0e+00 * u(k,j+1,i,m) + u(k,j+2,i,m) ); } } if (east == -1) { for (m = 0; m < 5; m++) { rsd(k,ny-1,i,m) = rsd(k,ny-1,i,m) - dssp * ( u(k,ny-3,i,m) - 4.0e+00 * u(k,ny-2,i,m) + 6.0e+00 * u(k,ny-1,i,m) - 4.0e+00 * u(k,ny,i,m) ); rsd(k,ny,i,m) = rsd(k,ny,i,m) - dssp * ( u(k,ny-2,i,m) - 4.0e+00 * u(k,ny-1,i,m) + 5.0e+00 * u(k,ny,i,m) ); } } } } /** * zeta-direction flux differences. 
*/ for (j = jst; j <= jend; j++) { for (i = ist; i <= iend; i++) { for (k = 0; k <= nz - 1; k++) { flux(k,j,i,0) = u(k,j,i,3); u41 = u(k,j,i,3) / u(k,j,i,0); q = 0.50e+00 * ( u(k,j,i,1) * u(k,j,i,1) + u(k,j,i,2) * u(k,j,i,2) + u(k,j,i,3) * u(k,j,i,3) ) / u(k,j,i,0); flux(k,j,i,1) = u(k,j,i,1) * u41; flux(k,j,i,2) = u(k,j,i,2) * u41; flux(k,j,i,3) = u(k,j,i,3) * u41 + c2 * ( u(k,j,i,4) - q ); flux(k,j,i,4) = ( c1 * u(k,j,i,4) - c2 * q ) * u41; } for (k = 1; k <= nz - 2; k++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = rsd(k,j,i,m) - tz2 * ( flux(k+1,j,i,m) - flux(k-1,j,i,m) ); } } for (k = 1; k <= nz - 1; k++) { tmp = 1.0e+00 / u(k,j,i,0); u21k = tmp * u(k,j,i,1); u31k = tmp * u(k,j,i,2); u41k = tmp * u(k,j,i,3); u51k = tmp * u(k,j,i,4); tmp = 1.0e+00 / u(k-1,j,i,0); u21km1 = tmp * u(k-1,j,i,1); u31km1 = tmp * u(k-1,j,i,2); u41km1 = tmp * u(k-1,j,i,3); u51km1 = tmp * u(k-1,j,i,4); flux(k,j,i,1) = tz3 * ( u21k - u21km1 ); flux(k,j,i,2) = tz3 * ( u31k - u31km1 ); flux(k,j,i,3) = (4.0e+00/3.0e+00) * tz3 * ( u41k - u41km1 ); flux(k,j,i,4) = 0.50e+00 * ( 1.0e+00 - c1 * c5 ) * tz3 * ( ( u21k * u21k + u31k * u31k + u41k * u41k ) - ( u21km1 * u21km1 + u31km1 * u31km1 + u41km1 * u41km1 ) ) + (1.0e+00/6.0e+00) * tz3 * ( u41k * u41k - u41km1 * u41km1 ) + c1 * c5 * tz3 * ( u51k - u51km1 ); } for (k = 1; k <= nz - 2; k++) { rsd(k,j,i,0) = rsd(k,j,i,0) + dz1 * tz1 * ( u(k-1,j,i,0) - 2.0e+00 * u(k,j,i,0) + u(k+1,j,i,0) ); rsd(k,j,i,1) = rsd(k,j,i,1) + tz3 * c3 * c4 * ( flux(k+1,j,i,1) - flux(k,j,i,1) ) + dz2 * tz1 * ( u(k-1,j,i,1) - 2.0e+00 * u(k,j,i,1) + u(k+1,j,i,1) ); rsd(k,j,i,2) = rsd(k,j,i,2) + tz3 * c3 * c4 * ( flux(k+1,j,i,2) - flux(k,j,i,2) ) + dz3 * tz1 * ( u(k-1,j,i,2) - 2.0e+00 * u(k,j,i,2) + u(k+1,j,i,2) ); rsd(k,j,i,3) = rsd(k,j,i,3) + tz3 * c3 * c4 * ( flux(k+1,j,i,3) - flux(k,j,i,3) ) + dz4 * tz1 * ( u(k-1,j,i,3) - 2.0e+00 * u(k,j,i,3) + u(k+1,j,i,3) ); rsd(k,j,i,4) = rsd(k,j,i,4) + tz3 * c3 * c4 * ( flux(k+1,j,i,4) - flux(k,j,i,4) ) + dz5 * tz1 * ( u(k-1,j,i,4) - 
2.0e+00 * u(k,j,i,4) + u(k+1,j,i,4) ); } /** * Fourth-order dissipation. */ for (m = 0; m < 5; m++) { rsd(1,j,i,m) = rsd(1,j,i,m) - dssp * ( + 5.0e+00 * u(1,j,i,m) - 4.0e+00 * u(2,j,i,m) + u(3,j,i,m) ); rsd(2,j,i,m) = rsd(2,j,i,m) - dssp * ( - 4.0e+00 * u(1,j,i,m) + 6.0e+00 * u(2,j,i,m) - 4.0e+00 * u(3,j,i,m) + u(4,j,i,m) ); } for (k = 3; k <= nz - 4; k++) { for (m = 0; m < 5; m++) { rsd(k,j,i,m) = rsd(k,j,i,m) - dssp * ( u(k-2,j,i,m) - 4.0e+00 * u(k-1,j,i,m) + 6.0e+00 * u(k,j,i,m) - 4.0e+00 * u(k+1,j,i,m) + u(k+2,j,i,m) ); } } for (m = 0; m < 5; m++) { rsd(nz-3,j,i,m) = rsd(nz-3,j,i,m) - dssp * ( u(nz-5,j,i,m) - 4.0e+00 * u(nz-4,j,i,m) + 6.0e+00 * u(nz-3,j,i,m) - 4.0e+00 * u(nz-2,j,i,m) ); rsd(nz-2,j,i,m) = rsd(nz-2,j,i,m) - dssp * ( u(nz-4,j,i,m) - 4.0e+00 * u(nz-3,j,i,m) + 5.0e+00 * u(nz-2,j,i,m) ); } } } } rhs_cl.c0000644000175600017620000002003511553632707010670 0ustar sjpsjp#include "size.h" #include "applu.h" #include "mpinpb.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" #include /** * Solution of the right-hand-side. * TODO: Need to put the MPI back in. 
*/ void rhs_cl(cl_mem u, cl_mem rsd, cl_mem frct, cl_mem flux) { cl_int status; // Set the arguments to the kernels: // rhs_setup: status = clSetKernelArg(rhs_setup_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_setup_kernel, 1, sizeof(cl_mem), (void*) &frct); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_setup: "); // XI DIRECTION // rhs_xi1: status = clSetKernelArg(rhs_xi1_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_xi1_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_xi1: "); // rhs_xi2: status = clSetKernelArg(rhs_xi2_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_xi2_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_xi2: "); // rhs_xi3: status = clSetKernelArg(rhs_xi3_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_xi3_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_xi3: "); // rhs_xi4: status = clSetKernelArg(rhs_xi4_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_xi4_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_xi4_kernel, 2, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_xi4: "); // rhs_xi_dissipation: status = clSetKernelArg(rhs_xi_dissipation_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_xi_dissipation_kernel, 1, sizeof(cl_mem), (void*) &rsd); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_xi_dissipation: "); // Single rhs_xi kernel. 
status = clSetKernelArg(rhs_xi_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_xi_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_xi_kernel, 2, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_xi: "); // ETA DIRECTION // rhs_eta1: status = clSetKernelArg(rhs_eta1_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_eta1_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_eta1: "); // rhs_eta2: status = clSetKernelArg(rhs_eta2_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_eta2_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_eta2: "); // rhs_eta3: status = clSetKernelArg(rhs_eta3_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_eta3_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_eta3: "); // rhs_eta4: status = clSetKernelArg(rhs_eta4_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_eta4_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_eta4_kernel, 2, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_eta4: "); // rhs_eta_dissipation: status = clSetKernelArg(rhs_eta_dissipation_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_eta_dissipation_kernel, 1, sizeof(cl_mem), (void*) &rsd); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_eta_dissipation: "); // Single rhs_eta kernel. 
status = clSetKernelArg(rhs_eta_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_eta_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_eta_kernel, 2, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_eta: "); // ZETA DIRECTION // rhs_zeta1: status = clSetKernelArg(rhs_zeta1_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_zeta1_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_zeta1: "); // rhs_zeta2: status = clSetKernelArg(rhs_zeta2_kernel, 0, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_zeta2_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_zeta2: "); // rhs_zeta3: status = clSetKernelArg(rhs_zeta3_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_zeta3_kernel, 1, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_zeta3: "); // rhs_zeta4: status = clSetKernelArg(rhs_zeta4_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_zeta4_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_zeta4_kernel, 2, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_zeta4: "); // rhs_zeta_dissipation: status = clSetKernelArg(rhs_zeta_dissipation_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_zeta_dissipation_kernel, 1, sizeof(cl_mem), (void*) &rsd); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_zeta_dissipation: "); // Single rhs_zeta kernel. 
status = clSetKernelArg(rhs_zeta_kernel, 0, sizeof(cl_mem), (void*) &u); status |= clSetKernelArg(rhs_zeta_kernel, 1, sizeof(cl_mem), (void*) &rsd); status |= clSetKernelArg(rhs_zeta_kernel, 2, sizeof(cl_mem), (void*) &flux); wclCheckError(status, CL_SUCCESS, " Could not set kernel arguments for rhs_zeta: "); // Call the kernels size_t local[3] = {rhsblock[0], rhsblock[1], rhsblock[2]}; size_t global[3] = {rhsblock[0] * rhsgrid[0], rhsblock[1] * rhsgrid[1], rhsblock[2] * rhsgrid[2]}; timer_start(8); status = clEnqueueNDRangeKernel(subQueue, rhs_setup_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_pause(8); exchange_3_cl(0); timer_start(8); status = clEnqueueNDRangeKernel(subQueue, rhs_xi1_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_xi2_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_xi3_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_xi4_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_xi_dissipation_kernel, 3, NULL, global, local, 0, NULL, NULL); //status = clEnqueueNDRangeKernel(subQueue, rhs_xi_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_pause(8); exchange_3_cl(1); timer_start(8); status = clEnqueueNDRangeKernel(subQueue, rhs_eta1_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_eta2_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_eta3_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_eta4_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_eta_dissipation_kernel, 3, NULL, global, local, 0, NULL, NULL); //status = clEnqueueNDRangeKernel(subQueue, rhs_eta_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_zeta1_kernel, 3, NULL, global, local, 0, 
NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_zeta2_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_zeta3_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_zeta4_kernel, 3, NULL, global, local, 0, NULL, NULL); status = clEnqueueNDRangeKernel(subQueue, rhs_zeta_dissipation_kernel, 3, NULL, global, local, 0, NULL, NULL); //status = clEnqueueNDRangeKernel(subQueue, rhs_zeta_kernel, 3, NULL, global, local, 0, NULL, NULL); timer_stop(8); wclCheckError(status, CL_SUCCESS, " Could not launch rhs kernels: "); } setbv.c0000644000175600017620000000252511553632707010545 0ustar sjpsjp// C port of NPB3.2 // subroutine setbv #include "applu.h" /** * Set the boundary values of dependent variables. */ void setbv() { /** * Local variables. */ int i, j, k; int iglob, jglob; /** * Set the dependent variable values along the top and bottom faces. */ for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; exact(iglob, jglob, 0, &u(0,j,i,0)); exact(iglob, jglob, nz - 1, &u(nz-1,j,i,0)); } } /** * Set the dependent variable values along north and south faces. */ if (west == -1 ) { for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; exact(iglob, 2, k, &u(k,2,i,0)); } } } if (east == -1) { for (k = 0; k <= nz - 1; k++) { for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; exact(iglob, ny0 + 1, k, &u(k,ny+1,i,0)); } } } /** * Set the dependent variable values along east and west faces. 
*/ if (north == -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; exact(2, jglob, k, &u(k,j,2,0)); } } } if (south == -1) { for (k = 0; k <= nz - 1; k++) { for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; exact(nx0 + 1, jglob, k, &u(k,j,nx+1,0)); } } } } setcoeff.c0000644000175600017620000000534211553632707011220 0ustar sjpsjp// C port of NPB3.2 // subroutine setcoeff #include "applu.h" // Define max function. #define max2(x,y) ( (x > y) ? x : y ) #define max3(x,y,z) ( max2( max2(x, y), z ) ) /** * Setup coefficients. */ void setcoeff() { // Don't know how to group these coefficients. dxi = 1.0e+00 / ( nx0 - 1 ); deta = 1.0e+00 / ( ny0 - 1 ); dzeta = 1.0e+00 / ( nz0 - 1 ); tx1 = 1.0e+00 / ( dxi * dxi ); tx2 = 1.0e+00 / ( 2.0e+00 * dxi ); tx3 = 1.0e+00 / dxi; ty1 = 1.0e+00 / ( deta * deta ); ty2 = 1.0e+00 / ( 2.0e+00 * deta ); ty3 = 1.0e+00 / deta; tz1 = 1.0e+00 / ( dzeta * dzeta ); tz2 = 1.0e+00 / ( 2.0e+00 * dzeta ); tz3 = 1.0e+00 / dzeta; ii1 = 2; ii2 = nx0 - 1; ji1 = 2; ji2 = ny0 - 2; ki1 = 3; ki2 = nz0 - 1; // Diffusion coefficients. dx1 = 0.75e+00; dx2 = dx1; dx3 = dx1; dx4 = dx1; dx5 = dx1; dy1 = 0.75e+00; dy2 = dy1; dy3 = dy1; dy4 = dy1; dy5 = dy1; dz1 = 1.00e+00; dz2 = dz1; dz3 = dz1; dz4 = dz1; dz5 = dz1; // Fourth difference dissipation. dssp = ( max3 (dx1, dy1, dz1 ) ) / 4.0e+00; //dssp = 1.00e+00 / 4.0e+00; // We know that dz1 is the maximum. // Coefficients of the exact solution to the first pde. ce[0][0] = 2.0e+00; ce[1][0] = 0.0e+00; ce[2][0] = 0.0e+00; ce[3][0] = 4.0e+00; ce[4][0] = 5.0e+00; ce[5][0] = 3.0e+00; ce[6][0] = 5.0e-01; ce[7][0] = 2.0e-02; ce[8][0] = 1.0e-02; ce[9][0] = 3.0e-02; ce[10][0] = 5.0e-01; ce[11][0] = 4.0e-01; ce[12][0] = 3.0e-01; // Coefficients of the exact solution to the second pde. 
ce[0][1] = 1.0e+00; ce[1][1] = 0.0e+00; ce[2][1] = 0.0e+00; ce[3][1] = 0.0e+00; ce[4][1] = 1.0e+00; ce[5][1] = 2.0e+00; ce[6][1] = 3.0e+00; ce[7][1] = 1.0e-02; ce[8][1] = 3.0e-02; ce[9][1] = 2.0e-02; ce[10][1] = 4.0e-01; ce[11][1] = 3.0e-01; ce[12][1] = 5.0e-01; // Coefficients of the exact solution to the third pde. ce[0][2] = 2.0e+00; ce[1][2] = 2.0e+00; ce[2][2] = 0.0e+00; ce[3][2] = 0.0e+00; ce[4][2] = 0.0e+00; ce[5][2] = 2.0e+00; ce[6][2] = 3.0e+00; ce[7][2] = 4.0e-02; ce[8][2] = 3.0e-02; ce[9][2] = 5.0e-02; ce[10][2] = 3.0e-01; ce[11][2] = 5.0e-01; ce[12][2] = 4.0e-01; // Coefficients of the exact solution to the fourth pde. ce[0][3] = 2.0e+00; ce[1][3] = 2.0e+00; ce[2][3] = 0.0e+00; ce[3][3] = 0.0e+00; ce[4][3] = 0.0e+00; ce[5][3] = 2.0e+00; ce[6][3] = 3.0e+00; ce[7][3] = 3.0e-02; ce[8][3] = 5.0e-02; ce[9][3] = 4.0e-02; ce[10][3] = 2.0e-01; ce[11][3] = 1.0e-01; ce[12][3] = 3.0e-01; // Coefficients of the exact solution to the fifth pde. ce[0][4] = 5.0e+00; ce[1][4] = 4.0e+00; ce[2][4] = 3.0e+00; ce[3][4] = 2.0e+00; ce[4][4] = 1.0e-01; ce[5][4] = 4.0e-01; ce[6][4] = 3.0e-01; ce[7][4] = 5.0e-02; ce[8][4] = 4.0e-02; ce[9][4] = 3.0e-02; ce[10][4] = 1.0e-01; ce[11][4] = 3.0e-01; ce[12][4] = 2.0e-01; } sethyper.c0000644000175600017620000000313411553632707011262 0ustar sjpsjp// C port of NPB3.2 // subroutine sethyper #include "applu.h" /** * Fore each column in a hyperplane, istart = first row, */ void sethyper() { /** * Local variables. */ int i, j; int iglob, jglob; int kp; /** * Compute the pointers for hyperplanes. */ for (kp = 2; kp <= nx0 + ny0; kp++) { icomms[kp-1] = 0; icommn[kp-1] = 0; icomme[kp-1] = 0; icommw[kp-1] = 0; /** * Check to see if comm. to south is required. */ if (south != -1) { i = iend; iglob = ipt + i; jglob = kp - iglob; j = jglob - jpt; if (jglob >= 2 && jglob <= ny0 - 1 && j >= jst && j <= jend) { icomms[kp-1] = 1; } } /** * Check to see if comm. to north is required. 
*/ if (north != -1) { i = ist; iglob = ipt + i; jglob = kp - iglob; j = jglob - jpt; if (jglob >= 2 && jglob <= ny0 - 1 && j >= jst && j <= jend) { icommn[kp-1] = 1; } } /** * Check to see if comm. to east is required. */ if (east != -1) { j = jend; jglob = jpt + j; iglob = kp - jglob; i = iglob - ipt; if (iglob >= 2 && iglob <= nx0 - 1 && i >= ist && i <= iend) { icomme[kp-1] = 1; } } /** * Check to see if comm. to west is required. */ if (west != -1) { j = jst; jglob = jpt + j; iglob = kp - jglob; i = iglob - ipt; if (iglob >= 2 && iglob <= nx0 - 1 && i >= ist && i <= iend) { icommw[kp-1] = 1; } } } icomms[0] = 0; icommn[0] = 0; icomme[0] = 0; icommw[0] = 0; icomms[nx0 + ny0 + 1] = 0; icommn[nx0 + ny0 + 1] = 0; icomme[nx0 + ny0 + 1] = 0; icommw[nx0 + ny0 + 1] = 0; } setiv.c0000644000175600017620000000310511553632707010547 0ustar sjpsjp// C port of NPB3.2 // subroutine setiv #include "applu.h" /** * Set the initial values of independent variables based on tri-linear * interpolation of boundary values in the computational space. */ void setiv() { /** * Local variables. */ int i, j, k, m; int iglob, jglob; double xi, eta, zeta; double pxi, peta, pzeta; double ue_1jk[5], ue_nx0jk[5], ue_i1k[5], ue_iny0k[5], ue_ij1[5], ue_ijnz[5]; for (k = 1; k <= nz - 2; k++) { zeta = ( (double) k ) / (nz - 1); for (j = 2; j <= ny + 1; j++) { jglob = jpt + j; if (jglob != 2 && jglob != ny0 + 1) { eta = ( (double) (jglob - 2) ) / (ny0 - 1); for (i = 2; i <= nx + 1; i++) { iglob = ipt + i; if (iglob != 2 && iglob != nx0 + 1) { xi = ( (double) (iglob - 2) ) / (nx0 - 1); // Find exact solutions for all i, j, k. 
exact(2, jglob, k, ue_1jk); exact(nx0 + 1, jglob, k, ue_nx0jk); exact(iglob, 2, k, ue_i1k); exact(iglob, ny0 + 1, k, ue_iny0k); exact(iglob, jglob, 0, ue_ij1); exact(iglob, jglob, nz - 1, ue_ijnz); for (m = 0; m < 5; m++) { pxi = (1.0e+00 - xi) * ue_1jk[m] + xi * ue_nx0jk[m]; peta = (1.0e+00 - eta) * ue_i1k[m] + eta * ue_iny0k[m]; pzeta = (1.0e+00 - zeta) * ue_ij1[m] + zeta * ue_ijnz[m]; u(k,j,i,m) = pxi + peta + pzeta - pxi * peta - peta * pzeta - pzeta * pxi + pxi * peta * pzeta; } } } } } } } setparams0000755000175600017620000003715311573566600011210 0ustar sjpsjpELF> @@˜&@8@ @@@@@ˆˆÈÈ@È@@@`` ````` ```````Påtd``@`@,,Qåtd/lib64/ld-linux-x86-64.so.2   r!(`év @S?@`¼" € @ @@ÄÖGH`ÿO0`a €@p(`ðã} € @Oƒ @èñÿp`‰ à @i8`$ñÿp`Üñÿp`è ¤@  ` @½* h@—ñÿlibm.so.6libmpi.so.0_fini_lib_version_initlibopen-rte.so.0libopen-pal.so.0libibverbs.so.1libtorque.so.2libnuma.so.1libdl.so.2libnsl.so.1libutil.so.1libpthread.so.0libc.so.6_mcount__register_atforkfopenfcloseatoifprintf__cxa_finalize__libc_start_main_edata__bss_start_end/opt/torque/2.5.5/lib__xargc__xargv__longdouble_used_IO_stdin_used__data_startilog2__fsripow2___Argv__fsr_init_valueGLIBC_2.3.2GLIBC_2.2.5²ri ¨ui ´À`à`è`ð`ø` ````UH‹ìSATAUAVAWPHƒÄA_A^A]A\[ÉÂÿ5> ÿ%@ @ÿ%> héàÿÿÿÿ%6 héÐÿÿÿÿ%. 
héÀÿÿÿÿ%& hé°ÿÿÿÿ% hé ÿÿÿÿ% héÿÿÿÿ% hé€ÿÿÿÿ% hépÿÿÿ3íH‰,% `X‰%@`H‹ÜH‰$%8`H‰$%H`HƒäðUPSR›è=AYZ^THÇÇ` @HÇÁh@IÇÀ@@èZÿÿÿôfD€Ã€„UH‹ìPSQRVW¸…À„¿‹ØÑëƒã¶“„@‹ØÁë ƒã3Ó‹Øóã Ó‹ØÁë‹ËÑá3Ùƒã‹ËÁá ÑHƒìÙ<$· $áÀð щ$Ù,$HƒÄƒâ?Áã ÓÁâ©tÊ@€3À¹@HìHƒäðH‹üóH«®$‹D$…Àu¸¿ÿ»ã Ó‹\$ã?ÿÿ Ó#ЉT$®T$H‹ÅHƒè0H‹à_^ZY[XÉÃUH‹ì…ÿ~@¹3À;Ït:ɃÀ;Ït1ɃÀ;Ït(ɃÀ;ÏtɃÀ;ÏtɃÀƒø|θÿÿÿÿë¸ÿÿÿÿÉÃf„@@UH‹ì…ÿ|Z…ÿtOwÿƒþ¸}€ÀNÿ‹Ö‹ñ…Òuóë8…ö}3ÿƒÿ|fDÁàƒÆøƒþ}õ…ö|ÀƒÆÿ…ö}÷ë ¸ë¸ÿÿÿÿÉÃf„€€UH‹ìSATAUAVAWHƒìH‹Þƒÿ…}H‹{H3ÀècýÿÿD‹àH‹CH‹{¾H3ÀèMýÿÿD‹èH=gH5XèýÿÿL‹ðM…ö„H5I‹þH3Àè9ýÿÿH5ÖI‹þH3Àè'ýÿÿH5¼I‹þH3ÀèýÿÿH5¢I‹þH3ÀèýÿÿƒûS„»ƒûW„šƒûAt}ƒûBt`ƒûCtCƒûDuHrH‰EÈA¿˜»,é™H=7‹óH3ÀèAüÿÿI‹þè™üÿÿ¸ÿÿÿÿé¡H<H‰EÈA¿¢»úë^H$H‰EÈA¿f»úëFH H‰EÈA¿@»úë.HüH‰EÈA¿!»,ëHìH‰EÈA¿ »2E…䎙¸3ÉA;Ät7ÀƒÁA;Ät-ÀƒÁA;Ät#ÀƒÁA;ÄtÀƒÁA;ÄtÀƒÁƒù|ɹÿÿÿÿA…ÉIÁD‹ÀAÑøA‹Èº3öA;ÔtCÒƒÆA;Ôt9ÒƒÆA;Ôt/ÒƒÆA;Ôt%ÒƒÆA;Ôt҃ƃþ|ɾÿÿÿÿë E3À3ɾÿÿÿÿCA@;ÖEÈ…É|R…ÉtGAÿƒø¾}öHÿ‹Ð‹Á…Òuóë7…À}3Ƀù|DÁæƒÀøƒø}õ…À|öƒÀÿ…À}÷ë ¾ë¾ÿÿÿÿE…À|VE…ÀtJ¿E…ÀtLA@ÿƒø}ÿHÿ‹Ð‹Á…Òuóë4…À}E3ÀAƒø|ÁçƒÀøƒø}õ…À|ÿƒÀÿ…À}÷ë ¿ë¿ÿÿÿÿA‡@Ž@ ÈÁét A‹Ç™÷þ‹Èë fA‹Çf™f÷þ¿ÈE‹ÇE…ÿxD;þ| A‹À™÷þD‹ÂA‹ÇA+ÀQA;ÇLʉMÀA‡@@ ÈÁét A‹Ç™÷ÿ‹Èë fA‹Çf™f÷ÿ¿ÈA‹÷E…ÿxD;ÿ|‹Æ™÷ÿ‹òA‹Ç+ÆQA;ÇLʉMÄH5aI‹þH3ÀèúÿÿH5/I‹þA‹ÔH3ÀèíùÿÿH5zI‹þH3ÀèÛùÿÿH5ðI‹þH3ÀèÉùÿÿH5ÆI‹þA‹×H3Àè´ùÿÿH5™I‹þA‹×H3ÀèŸùÿÿH5lI‹þA‹×H3ÀèŠùÿÿH5I‹þH3ÀèxùÿÿH5%I‹þH3ÀèfùÿÿH5ûI‹þ‹UÀH3ÀèQùÿÿH5ÎI‹þ‹UÄH3Àè<ùÿÿH5¡I‹þH3Àè*ùÿÿH5·I‹þH3ÀèùÿÿH5]I‹þH3ÀèùÿÿH5+I‹þ‹ÓH3ÀèòøÿÿH5÷I‹þ‹ÓH3ÀèÞøÿÿH5ËI‹þH‹UÈH3ÀèÈøÿÿH5UI‹þH3Àè¶øÿÿH5‹I‹þA‹ÕH3Àè¡øÿÿE;ï}LA‹ÏE…ÿxE;ý|‹Á™A÷ý‹ÊA‹×+Ñ…ÉuH53I‹þA‹×H3Àèiøÿÿë,H5AÕI‹þH3ÀèRøÿÿëH5I‹þA‹ÕH3Àè;øÿÿI‹þè#øÿÿ3Àë1H=H5%H3Àè©÷ÿÿ¸ÿÿÿÿëH=H3Àè“÷ÿÿ¸ÿÿÿÿHƒÄA_A^A]A\[ÉÃHù H…Àt H‹ í ét÷ÿÿ1Éë÷UH‹ìSATAUAVAWPHƒ=j t H‹=Á èx÷ÿÿHƒÄA_A^A]A\[ÉÂ?;/+ 73'#>:.* 62&"1.0e+002.0e+001.5e-030.5e+00#define problem_height %d #define kblock %d #define dt_default %s #define inorm_default %d #define itmax_default %d // Number of iterations, etc. #define isiz3 isiz03 #define isiz2 %d #define isiz1 %d // Array size per processor. #define isiz03 %d #define isiz02 %d #define isiz01 %d // Full array size. #define nnodes_compiled %d // Number of processors compiled for. 
Unknown class %c. */ * Define the problem and sub-problem sizes. /** Cannot open %s for writing. wsize.hUsage: ./setparams [number of processors] [problem class] [k-blocking] ;, öÿÿP€öÿÿx÷ÿÿ Àüÿÿèx   $$€ @OA†C $Là @iA†C ,t` @½A†C AƒBŒBBŽBzRx Ðûÿÿ 0ARbq~‰•¢²)) h@ @@è@`@¨@ À È`À¨@@ þÿÿo`@ÿÿÿoðÿÿo @``¢@²@Â@Ò@â@ò@ @ @ `À€ðÀ€ð@(#)crti.s 1.4 06/09/10 SMIas: Sun Compiler Common 12.2 Linux_i386 2010/08/13@(#)crt1.s 1.6 07/06/08 SMI@(#)fsrx.s 1.5 08/05/23 SMI@(#)values-Xa.c 6.1@(#)libc-port:gen/values-Xa.c 1.3@(#)synonyms.h 1.1acomp: Sun C 5.11 Linux_i386 2010/08/13iropt: Sun Compiler Common 12.2 Linux_i386 2010/08/13ir2hf: Sun Compiler Common 12.2 Linux_i386 2010/08/13ube: Sun Compiler (0x10|0x8|0x2) 12.2 Linux_i386 2010/08/13as: Sun Compiler Common 12.2 Linux_i386 2010/08/13@(#)stddef.h 1.18 04/09/28 SMI@(#)isa_defs.h 1.31 06/01/09 SMI@(#)stdarg.h 1.47 04/09/28 SMI@(#)va_impl.h 1.1 04/11/19 SMIacomp: Sun C 5.11 Linux_i386 2010/08/13iropt: Sun Compiler Common 12.2 Linux_i386 2010/08/13ir2hf: Sun Compiler Common 12.2 Linux_i386 2010/08/13ube: Sun Compiler (0x10|0x8|0x2) 12.2 Linux_i386 2010/08/13as: Sun Compiler Common 12.2 Linux_i386 2010/08/13ipo: Sun Compiler Common 12.2 Linux_i386 2010/08/13GCC: (Debian 4.4.5-10) 4.4.5@(#)crtn.s 1.2 05/07/29 SMIas: Sun Compiler Common 12.2 Linux_i386 2010/08/13,Õ @#Õ+œ__pthread_atforkÿÿÿÿavalues-Xa.c /net/spalko.sfbay/export/home1/train_builds/aten_Linux/100813.intel-Linux/lang/csu/amd64-Linux /net/spalko.sfbay/export/home1/train_builds/aten_Linux/100813.intel-Linux/boot3/opt/sun/sunstudioaten/prod/bin/cc -c -O -m64 -I. 
-DELF -m64 values-Xa.cXa;O;R=Sun C 5.11 Linux_i386 2010/08/13;backend;raw;cd;DBG_GEN 5.3.0ÿÿÿÿ\setparams.c /home/sjp/svn/trunk/gpu/opencl/lu /opt/sun/studio/ss12u2/SolarisStudio12.2-linux-x86-tar-ML/solstudio12.2/prod/bin/cc -O5 -native -xprefetch -xunroll=8 -xipo -xvector -I ./headers -I /opt/opencl/amd-app-2.4/include -DTIMING -lm -I/opt/mpi/openmpi/1.4.3/sunstudio12u2/include -mt -L/opt/torque/2.5.5/lib -Wl,--rpath -Wl,/opt/torque/2.5.5/lib -L/opt/mpi/openmpi/1.4.3/sunstudio12u2/lib -lmpi -lopen-rte -lopen-pal -libverbs -ltorque -lnuma -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl setparams.cXa;O;R=Sun C 5.11 Linux_i386 2010/08/13;backend;raw;cd;DBG_GEN 5.3.0”',½Ù @<@ëd£72fÏintm-ÉrŠšy4W @<@wî %5” ¶6”6 7”l jù D! î # î W)h …D‡D% …D‡D%% $ > $ > $ >   I' .? : ; ' I@  : ; I I ! 4: ; I? < &Iÿÿÿÿˆ~ÿ /net/spalko.sfbay/export/home1/train_builds/aten_Linux/100813.intel-Linux/lang/csu/amd64-Linuxvalues-Xa.cÿÿÿÿKAÿ /home/sjp/svn/trunk/gpu/opencl/lusetparams.c^Bû ../stdio-commonpthread_atfork.c_itoa.h  @7¹»long long intpthread_atfork.cchildpreparelong long unsigned int_itoa_upper_digits__dso_handleunsigned chardouble__pthread_atfork_Bool_itoa_lower_digitsshort unsigned intparentGNU C 4.4.5floatshort int/home/aurel32/eglibc/eglibc-2.11.2/nptlUUTTQQ.symtab.strtab.shstrtab.interp.hash.dynsym.dynstr.gnu.version.gnu.version_r.rela.dyn.rela.plt.init.text.fini.rodata.rodata1.eh_frame_hdr.eh_frame.dynamic.got.got.plt.data.comment.debug_aranges.debug_pubnames.debug_info.debug_abbrev.debug_line.debug_str.debug_locÈ@È#è@èÀ) ¨@¨¸1`@`À9ÿÿÿo @ :Fþÿÿo`@`0U@_¨@¨À ih@h!dŒ@Œo @  u@@@9{€@€(ƒ¨@¨¸Œ`@`,š@Ф````­À`À²È`ÈX» ` PÁpôÊd0Ù”'é»õ»!Ä"M0Ì#Í$¢o%%˜. 
P ¸8³È@è@¨@`@ @`@@¨@ h@ Œ@ @ @@ €@¨@`@@``À`È` `ñÿñÿñÿ „@ ñÿ,P`8T`DX`OP`Z ¤@iñÿO``Z ¨@uñÿ†@¡°@¼ø@×`@'ò @ ¨@(8@D @Uè@pÈ`†``£¨@¾@Ù@ôÀ@è@+¸@FX@b¨@}À@˜@H³ `À(@ÛÈ@÷ˆ@x@.@J°@.e @th`à@«@@Æd`ã@ÿà@@5``>!(`I] @Sd@`l" € @t @@z™¬H`´Ó0`å €@ô(`/ € @O5 @è;ñÿp`G à @iM8`Uñÿp`Znñÿp`uŠ ¤@— ` @½œ h@¢ñÿcrti.scrtn.sfsr.strap_tablevalues-Xa.c__nan_union__huge_valf__huge_valDdata.dataDrodata.rodatasetparams.cpthread_atfork.c.XB$BQABA2u7NEsy.main..L99.XB$BQABA2u7NEsy.main..L72.XB$BQABA2u7NEsy.main..L95.XB$BQABA2u7NEsy.main..L91.XB$BQABA2u7NEsy.main..L61.XB$BQABA2u7NEsy.main..L82.XB$BQABA2u7NEsy.main..L102__pthread_atfork.XB$BQABA2u7NEsy.main..L57_GLOBAL_OFFSET_TABLE_.XA$BQABA2u7NEsy.__nan_union.XB$BQABA2u7NEsy.main..L98.XB$BQABA2u7NEsy.main..L53.XB$BQABA2u7NEsy.main..L94.XB$BQABA2u7NEsy.main..L64.XB$BQABA2u7NEsy.main..L105.XB$BQABA2u7NEsy.main..L68.XB$BQABA2u7NEsy.main..L101.XB$BQABA2u7NEsy.main..L60.XB$BQABA2u7NEsy.main..L97.XB$BQABA2u7NEsy.main..L52__dso_handle.XB$BQABA2u7NEsy.main..L93.XB$BQABA2u7NEsy.main..L108.XB$BQABA2u7NEsy.main..L84.XB$BQABA2u7NEsy.main..L100.XB$BQABA2u7NEsy.main..L104.XB$BQABA2u7NEsy.main..L59pthread_atfork.XA$BQABA2u7NEsy.__huge_val.XB$BQABA2u7NEsy.main..L96.XB$BQABA2u7NEsy.main..L92.XA$BQABA2u7NEsy.__huge_valf.XB$BQABA2u7NEsy.main..L103.XB$BQABA2u7NEsy.main..L58.XB$BQABA2u7NEsy.main..L54_DYNAMICdata_startprintf@@GLIBC_2.2.5_start__xargc_mcount_fini__register_atfork@@GLIBC_2.3.2fopen@@GLIBC_2.2.5__xargv__libc_start_main@@GLIBC_2.2.5__longdouble_used_IO_stdin_used__data_start__cxa_finalize@@GLIBC_2.2.5atoi@@GLIBC_2.2.5ilog2__fsr__bss_startipow2___Argv_endfclose@@GLIBC_2.2.5_edatafprintf@@GLIBC_2.2.5_lib_versionmain_init__fsr_init_valuesetparams.c0000644000175600017620000000766011553632710011420 0ustar sjpsjp// C port of NPB3.2 #define VERSION "3.2" #define FILENAME "size.h" #include #include #define max(x,y) ((x > y) ? x : y) /** * Integer log base two. * Return error if argument isn't a power of two or is less than or equal to zero. 
*/ int ilog2(int i) { int log2; int exp2 = 1; if (i <= 0) { return -1; } for (log2 = 0; log2 < 20; log2++) { if (exp2 == i) { return log2; } exp2 *= 2; } return -1; } /** * Integer power of two. */ int ipow2(int i) { int pow2 = 1; if (i < 0) { return -1; } if (i == 0) { return 1; } while (i--) { pow2 *= 2; } return pow2; } /** * Creates an appropriate size.h file for a given class and number of processors. */ int main(int argc, char* argv[]) { // Check we have the right number of command line arguments. if (argc != 4) { printf("Usage: ./setparams [number of processors] [problem class] [k-blocking]\n"); return -1; } // Define some variables for later. int nprocs, class, kblock; int isiz1, isiz2, itmax, inorm, problem_size; int xdiv, ydiv; char* dt_default; // Read the command line arguments. nprocs = atoi(argv[1]); class = *argv[2]; kblock = atoi(argv[3]); // Open a size.h file for writing. FILE* file; file = fopen(FILENAME, "w"); if (!file) { printf("Cannot open %s for writing.\n", FILENAME); return -1; } // Write the header. fprintf(file, "/**\n"); fprintf(file, " * Define the problem and sub-problem sizes.\n"); fprintf(file, " */\n"); fprintf(file, "\n"); // Set the defaults for a given class. if (class == 'S') { problem_size = 12; dt_default = "0.5e+00"; itmax = 50; } else if (class == 'W') { problem_size = 33; dt_default = "1.5e-03"; itmax = 300; } else if (class == 'A') { problem_size = 64; dt_default = "2.0e+00"; itmax = 250; } else if (class == 'B') { problem_size = 102; dt_default = "2.0e+00"; itmax = 250; } else if (class == 'C') { problem_size = 162; dt_default = "2.0e+00"; itmax = 250; } else if (class == 'D') { problem_size = 408; dt_default = "1.0e+00"; itmax = 300; } else { printf("Unknown class %c.\n", class); fclose(file); return -1; } // Calculate the sub-problem size. 
inorm = itmax; xdiv = ydiv = ilog2(nprocs) / 2; if (xdiv + ydiv != ilog2(nprocs)) { xdiv += 1; } xdiv = ipow2(xdiv); ydiv = ipow2(ydiv); isiz1 = problem_size / xdiv; if (isiz1 * xdiv < problem_size) { isiz1++; } isiz2 = problem_size / ydiv; if (isiz2 * ydiv < problem_size) { isiz2++; } // Write the number of processors. fprintf(file, "// Number of processors compiled for.\n"); fprintf(file, "#define nnodes_compiled %d\n", nprocs); fprintf(file, "\n"); // Write the full array size. fprintf(file, "// Full array size.\n"); fprintf(file, "#define isiz01 %d\n", problem_size); fprintf(file, "#define isiz02 %d\n", problem_size); fprintf(file, "#define isiz03 %d\n", problem_size); fprintf(file, "\n"); // Write the array size per processor. fprintf(file, "// Array size per processor.\n"); fprintf(file, "#define isiz1 %d\n", isiz1); fprintf(file, "#define isiz2 %d\n", isiz2); fprintf(file, "#define isiz3 isiz03\n"); fprintf(file, "\n"); // Write the other information. fprintf(file, "// Number of iterations, etc.\n"); fprintf(file, "#define itmax_default %d\n", itmax); fprintf(file, "#define inorm_default %d\n", inorm); fprintf(file, "#define dt_default %s\n", dt_default); fprintf(file, "\n"); // Write the kblocking and gpu information. fprintf(file, "#define kblock %d\n", kblock); // Defining problem height as the amount of memory that should be allocated in z direction for ursd. if (kblock >= problem_size) { fprintf(file, "#define problem_height %d\n", kblock); } else if ( (problem_size % kblock) != 0 ) { fprintf(file, "#define problem_height %d\n", ((problem_size / kblock) * kblock) + kblock); } else { fprintf(file, "#define problem_height %d\n", problem_size); } // Close the file and exit cleanly. fclose(file); return 0; } ssor_cl.c0000644000175600017620000003767011564202621011065 0ustar sjpsjp#include #include #include #include "mpinpb.h" #include "applu.h" #include "wcl.h" #include "applu_cl.h" #include "timers.h" // Kernels. 
cl_kernel blts_kernel; cl_kernel buts_kernel; cl_kernel l2norm_kernel; cl_kernel pre_kernel; cl_kernel post_kernel; cl_kernel rhs_setup_kernel; cl_kernel rhs_xi_kernel; cl_kernel rhs_eta_kernel; cl_kernel rhs_zeta_kernel; cl_kernel rhs_xi1_kernel; cl_kernel rhs_xi2_kernel; cl_kernel rhs_xi3_kernel; cl_kernel rhs_xi4_kernel; cl_kernel rhs_xi_dissipation_kernel; cl_kernel rhs_eta1_kernel; cl_kernel rhs_eta2_kernel; cl_kernel rhs_eta3_kernel; cl_kernel rhs_eta4_kernel; cl_kernel rhs_eta_dissipation_kernel; cl_kernel rhs_zeta1_kernel; cl_kernel rhs_zeta2_kernel; cl_kernel rhs_zeta3_kernel; cl_kernel rhs_zeta4_kernel; cl_kernel rhs_zeta_dissipation_kernel; cl_kernel flat_to_hyperplane_kernel; cl_kernel hyperplane_to_flat_kernel; cl_kernel flat_to_tiled_kernel; cl_kernel tiled_to_flat_kernel; cl_kernel tiled_to_hyperplane_kernel; cl_kernel hyperplane_to_tiled_kernel; cl_kernel memset_double_kernel; cl_kernel print_mem_kernel; cl_kernel ex1_unpack_north_kernel; cl_kernel ex1_unpack_west_kernel; cl_kernel ex1_unpack_south_kernel; cl_kernel ex1_unpack_east_kernel; cl_kernel ex1_pack_north_kernel; cl_kernel ex1_pack_west_kernel; cl_kernel ex1_pack_south_kernel; cl_kernel ex1_pack_east_kernel; cl_kernel ex3_unpack_north_kernel; cl_kernel ex3_unpack_west_kernel; cl_kernel ex3_unpack_south_kernel; cl_kernel ex3_unpack_east_kernel; cl_kernel ex3_pack_north_kernel; cl_kernel ex3_pack_west_kernel; cl_kernel ex3_pack_south_kernel; cl_kernel ex3_pack_east_kernel; // Buffers. cl_mem u_d; cl_mem rsd_d; cl_mem frct_d; cl_mem flux_d; cl_mem sum_d; cl_mem wavefront_offsets_2d_d; cl_mem wavefront_offsets_3d_d; cl_mem columns_d; cl_mem rows_d; cl_mem thread_mapping_d; cl_mem rearrangement_d; cl_mem ibuffer_d; cl_mem jbuffer_d; cl_mem buf_d; cl_mem buf1_d; // Fission stuff. 
cl_device_id subDevice; cl_context subContext; cl_command_queue subQueue; cl_device_id* subDevices; cl_command_queue* subCommands; // Init extension function pointers #define INIT_CL_EXT_FCN_PTR(name) \ if(!pfn_##name) { \ pfn_##name = (name##_fn) clGetExtensionFunctionAddress(#name); \ if(!pfn_##name) { \ fprintf(stderr, "Something went wrong with the function pointer."); \ } \ } static clCreateSubDevicesEXT_fn pfn_clCreateSubDevicesEXT = NULL; /** * Performs pseudo-time stepping SSOR iterations for given nonlinear pde's. * Carries out work (jacld, blts, jacu, buts) on OpenCL-enabled devices. */ void ssor_cl(int niter) { cl_int status; // If this is the first time this function is called, then do some OpenCL setup. if (niter == 1) { /** * Initialise the OpenCL platform and WCL library. */ const char* platform_name; if (opt_platform == APPLU_PLATFORM_AMD) { platform_name = "Advanced Micro Devices, Inc."; } else if (opt_platform == APPLU_PLATFORM_NVIDIA) { platform_name = "NVIDIA Corporation"; } else if (opt_platform == APPLU_PLATFORM_INTEL) { platform_name = "Intel(R) Corporation"; } else { fprintf(stderr, " Unrecognised platform.\n"); exit(EXIT_FAILURE); } cl_device_type device_type; if (opt_device == APPLU_DEVICE_CPU) { device_type = CL_DEVICE_TYPE_CPU; } else if (opt_device == APPLU_DEVICE_GPU) { device_type = CL_DEVICE_TYPE_GPU; } else { fprintf(stderr, " Unrecognised device type.\n"); exit(EXIT_FAILURE); } // Initialise the platform. wclGetPlatform(platform_name); wclInit(device_type); // Split the device into several devices (if possible). if (opt_fission == APPLU_FISSION_ON) { // Set up the function pointer. INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); // Query the number of devices available. 
cl_device_partition_property_ext subDeviceProperties[] = { CL_DEVICE_PARTITION_EQUALLY_EXT, 1, CL_PROPERTIES_LIST_END_EXT, 0 }; cl_uint devices; pfn_clCreateSubDevicesEXT(wclDevice, subDeviceProperties, 0, NULL, &devices); printf(" Fissioning device into %d sub-devices.\n", (int) devices); // Fission the device. subDevices = (cl_device_id*) malloc( devices * sizeof(cl_device_id) ); pfn_clCreateSubDevicesEXT(wclDevice, subDeviceProperties, devices, subDevices, NULL); // Create a subcontext and subCommands. cl_context_properties properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) wclPlatform, 0 }; subContext = clCreateContext(properties, 4, subDevices, NULL, NULL, &status); wclCheckError(status, CL_SUCCESS, " Error: Failed to create the subcontext.\n"); subCommands = (cl_command_queue*) malloc( devices * sizeof(cl_command_queue) ); int i; for (i = 0; i < devices; i++) { subCommands[i] = clCreateCommandQueue(subContext, subDevices[i], 0, &status); } wclCheckError(status, CL_SUCCESS, " Error: Failed to create the subqueues.\n"); // Choose a subdevice and command queue according to MPI rank. subDevice = subDevices[id]; subQueue = subCommands[id]; } else if (opt_fission == APPLU_FISSION_OFF) { subDevice = wclDevice; subQueue = wclCommands; subContext = wclContext; } // Work-group size is dependent on SIMD width... if (opt_device == APPLU_DEVICE_CPU) { rhsblock[0] = 1; // TODO: May need to change this number for Intel SDK. 
rhsblock[1] = 1; rhsblock[2] = 1; waveblock[0] = 1; waveblock[1] = 1; waveblock[2] = 1; ex1iblock[0] = 1; ex1iblock[1] = 1; ex1iblock[2] = 1; ex1jblock[0] = 1; ex1jblock[1] = 1; ex1jblock[2] = 1; ex3iblock[0] = 1; ex3iblock[1] = 1; ex3iblock[2] = 1; ex3jblock[0] = 1; ex3jblock[1] = 1; ex3jblock[2] = 1; } else if (opt_device == APPLU_DEVICE_GPU) { rhsblock[0] = 8; rhsblock[1] = 8; rhsblock[2] = 1; waveblock[0] = 64; waveblock[1] = 1; waveblock[2] = 1; ex1iblock[0] = 64; ex1iblock[1] = 1; ex1iblock[2] = 1; ex1jblock[0] = 64; ex1jblock[1] = 1; ex1jblock[2] = 1; ex3iblock[0] = 64; ex3iblock[1] = 1; ex3iblock[2] = 1; ex3jblock[0] = 64; ex3jblock[1] = 1; ex3jblock[2] = 1; } // Set the work-group distribution. if (opt_distribution == APPLU_DISTRIBUTION_FINE) { rhsgrid[0] = ceil((isiz1 + 4)/ (double) rhsblock[0]); rhsgrid[1] = ceil((isiz2 + 4)/ (double) rhsblock[1]); rhsgrid[2] = isiz3; wavegrid[0] = ceil( ((isiz1 + 4) * (isiz2 + 4)) / (double) waveblock[0] ); wavegrid[1] = 1; wavegrid[2] = 1; ex1igrid[0] = ceil( (iend - ist + 1) / (double) ex1iblock[0] ); ex1igrid[1] = kblock; ex1igrid[2] = 1; ex1jgrid[0] = ceil( (jend - jst + 1) / (double) ex1jblock[0] ); ex1jgrid[1] = kblock; ex1jgrid[2] = 1; ex3igrid[0] = ceil( ny / (double) ex3iblock[0] ); ex3igrid[1] = nz; ex3igrid[2] = 1; ex3jgrid[0] = ceil( nx / (double) ex3jblock[0] ); ex3jgrid[1] = nz; ex3jgrid[2] = 1; } else if (opt_distribution == APPLU_DISTRIBUTION_COARSE) { // Get the number of compute units. size_t comp_units; status = clGetDeviceInfo(subDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &comp_units, NULL); // Note that this might not be enough for the GPU to hide latency -- but it makes the point. 
rhsgrid[0] = 1; rhsgrid[1] = 1; rhsgrid[2] = comp_units; wavegrid[0] = comp_units; wavegrid[1] = 1; wavegrid[2] = 1; ex1igrid[0] = 1; ex1igrid[1] = comp_units; ex1igrid[2] = 1; ex1jgrid[0] = 1; ex1jgrid[1] = comp_units; ex1jgrid[2] = 1; ex3igrid[0] = 1; ex3igrid[1] = comp_units; ex3igrid[2] = 1; ex3jgrid[0] = 1; ex3jgrid[1] = comp_units; ex3jgrid[2] = 1; } else { fprintf(stderr, "Error: Failed to set the work-group distribution.\n"); exit(EXIT_FAILURE); } if (id == 0) { printf(" EP Block: %d x %d x %d\n", rhsblock[0], rhsblock[1], rhsblock[2]); printf(" EP Grid: %d x %d x %d\n", rhsgrid[0], rhsgrid[1], rhsgrid[2]); printf(" W Block: %d x %d x %d\n", waveblock[0], waveblock[1], waveblock[2]); printf(" W Grid: %d x %d x %d\n", wavegrid[0], wavegrid[1], wavegrid[2]); } // Create the device allocate_cl_buffers(); // Load the program source and build the executables. build_kernels(); } /** * Create the lookup tables. */ int lookup_height = 0; if (opt_blocking == APPLU_BLOCKING_OLD) { lookup_height = kblock; } else if (opt_blocking == APPLU_BLOCKING_NEW) { lookup_height = isiz3; } int* wavefront_offsets_2d = (int*) malloc( ((isiz2 + 4) + (isiz1 + 4)) * sizeof(int) ); int* wavefront_offsets_3d = (int*) malloc( ((isiz2 + 4) + (isiz1 + 4) + lookup_height) * sizeof(int) ); int* columns = (int*) malloc( (isiz2 + 4) * (isiz1 + 4) * sizeof(int) ); int* rows = (int*) malloc( (isiz2 + 4) * (isiz1 + 4) * sizeof(int) ); int* thread_mapping = (int*) malloc( (isiz2 + 4) * (isiz1 + 4) * sizeof(int) ); if (!wavefront_offsets_2d || !wavefront_offsets_3d || !columns || !rows || !thread_mapping) { fprintf(stderr, " Unable to allocate memory for lookup tables.\n"); exit(EXIT_FAILURE); } prepare_lookup_tables(wavefront_offsets_2d, wavefront_offsets_3d, columns, rows, thread_mapping); /** * Create ibuffer and jbuffer for ex1. 
*/ ibuf = (double*) malloc( kblock * 5 * (iend - ist + 1) * sizeof(double) ); jbuf = (double*) malloc( kblock * 5 * (jend - jst + 1) * sizeof(double) ); int istep; double tmp; double delunm[5]; root = 0; // Begin pseudo-time stepping iterations. tmp = 1.0e+00 / ( omega * ( 2.0e+00 - omega ) ); // Move all of the data to the device. clEnqueueWriteBuffer(subQueue, u_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), u, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, rsd_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), rsd, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, frct_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), frct, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, flux_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), flux, 0, NULL, NULL); // Tile the data. flat_to_tiled(u_d, rearrangement_d); swap_pointers(&u_d, &rearrangement_d); flat_to_tiled(rsd_d, rearrangement_d); swap_pointers(&rsd_d, &rearrangement_d); flat_to_tiled(frct_d, rearrangement_d); swap_pointers(&frct_d, &rearrangement_d); flat_to_tiled(flux_d, rearrangement_d); swap_pointers(&flux_d, &rearrangement_d); // Compute the steady-state residuals. rhs_cl(u_d, rsd_d, frct_d, flux_d); // Compute the L2 norms of newton iteration residuals. l2norm_cl(nz0, rsdnm, rsd_d, sum_d); MPI_Barrier(MPI_COMM_WORLD); timer_reset(); timer_start(0); #ifndef TIMING double wall_start, cpu_start; double wall_end, cpu_end; timer(&cpu_start, &wall_start); #endif // Start the timestep loop - do for niter times. for (istep = 1; istep <= niter; istep++) { if (id == 0) { if (istep % 20 == 0 || istep == itmax || istep == 1) { if (niter > 1) { printf(" Pseudo-time SSOR iteration no. = %d.\n", istep); } } } // Perform SSOR iteration. pre_cl(rsd_d); // Move to the hyperplane layout. 
timer_start(9); tiled_to_hyperplane(u_d, rearrangement_d); swap_pointers(&u_d, &rearrangement_d); tiled_to_hyperplane(rsd_d, rearrangement_d); swap_pointers(&rsd_d, &rearrangement_d); timer_stop(9); // For each tile in k-dimension (from bottom to top). int k; if (opt_blocking == APPLU_BLOCKING_OLD) { for (k = 0; k < nz; k += kblock) { // Perform the lower triangular solution. exchange_1_cl(k, 0); blts_cl(k, rsd_d, u_d, wavefront_offsets_2d_d, wavefront_offsets_3d_d, columns_d, rows_d, thread_mapping_d); exchange_1_cl(k + kblock - 1, 2); } } else if (opt_blocking == APPLU_BLOCKING_NEW) { blts_cl_new(rsd_d, u_d, wavefront_offsets_2d_d, wavefront_offsets_3d_d, columns_d, rows_d, thread_mapping_d); } // For each tile in k-dimension (from top to bottom). if (opt_blocking == APPLU_BLOCKING_OLD) { for (k = nz - 1; k >= 0; k -= kblock) { // Perform the upper triangular solution. exchange_1_cl(k, 1); buts_cl(k, rsd_d, u_d, wavefront_offsets_2d_d, wavefront_offsets_3d_d, columns_d, rows_d, thread_mapping_d); exchange_1_cl(k - kblock + 1, 3); } } else if (opt_blocking == APPLU_BLOCKING_NEW) { buts_cl_new(rsd_d, u_d, wavefront_offsets_2d_d, wavefront_offsets_3d_d, columns_d, rows_d, thread_mapping_d); } // Move to the tiled memory layout. timer_start(9); hyperplane_to_tiled(u_d, rearrangement_d); swap_pointers(&u_d, &rearrangement_d); hyperplane_to_tiled(rsd_d, rearrangement_d); swap_pointers(&rsd_d, &rearrangement_d); timer_stop(9); // Update the variables. post_cl(tmp, u_d, rsd_d); // Compute the max-norms of newton iteration corrections. 
if (istep % inorm == 0) { l2norm_cl(nz0, delunm, rsd_d, sum_d); if (ipr == 1 && id == 0) { printf(" RMS-norm of SSOR-iteration correction for first pde = %e.\n", delunm[0]); printf(" RMS-norm of SSOR-iteration correction for second pde = %e.\n", delunm[1]); printf(" RMS-norm of SSOR-iteration correction for third pde = %e.\n", delunm[2]); printf(" RMS-norm of SSOR-iteration correction for fourth pde = %e.\n", delunm[3]); printf(" RMS-norm of SSOR-iteration correction for fifth pde = %e.\n", delunm[4]); } else if (ipr == 2 && id == 0) { printf(" %d, %f.\n", istep, delunm[4]); } } // Compute the steady-state residuals. rhs_cl(u_d, rsd_d, frct_d, flux_d); // Compute the max-norms of newton iteration residuals. if (istep % inorm == 0 || istep == itmax ) { l2norm_cl(nz0, rsdnm, rsd_d, sum_d); if (ipr == 1 && id == 0) { printf(" RMS-norm of steady state residual for first pde = %e.\n", rsdnm[0]); printf(" RMS-norm of steady state residual for second pde = %e.\n", rsdnm[1]); printf(" RMS-norm of steady state residual for third pde = %e.\n", rsdnm[2]); printf(" RMS-norm of steady state residual for fourth pde = %e.\n", rsdnm[3]); printf(" RMS-norm of steady state residual for fifth pde = %e.\n", rsdnm[4]); } } // Check the newton-iteration residuals against the tolerance levels. if (rsdnm[0] < tolrsd[0] && rsdnm[1] < tolrsd[1] && rsdnm[2] < tolrsd[2] && rsdnm[3] < tolrsd[3] && rsdnm[4] < tolrsd[4]) { if (ipr == 1 && id == 0) { printf(" Convergence was achieved after %d pseudo-time steps.\n", istep); return; } } } // End the timers. timer_stop(0); #ifndef TIMING clFinish(subQueue); timer(&cpu_end, &wall_end); cputime = cpu_end - cpu_start; walltime = wall_end - wall_start; #endif // Pull all of the data back. 
tiled_to_flat(u_d, rearrangement_d); swap_pointers(&u_d, &rearrangement_d); tiled_to_flat(rsd_d, rearrangement_d); swap_pointers(&rsd_d, &rearrangement_d); tiled_to_flat(frct_d, rearrangement_d); swap_pointers(&frct_d, &rearrangement_d); tiled_to_flat(flux_d, rearrangement_d); swap_pointers(&flux_d, &rearrangement_d); clEnqueueReadBuffer(subQueue, u_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), u, 0, NULL, NULL); clEnqueueReadBuffer(subQueue, rsd_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), rsd, 0, NULL, NULL); clEnqueueReadBuffer(subQueue, frct_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), frct, 0, NULL, NULL); clEnqueueReadBuffer(subQueue, flux_d, CL_TRUE, 0, isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double), flux, 0, NULL, NULL); // Free the host memory. free(wavefront_offsets_2d); free(wavefront_offsets_3d); free(columns); free(rows); free(thread_mapping); free(ibuf); free(jbuf); // If this is the last time we expect to call SSOR, then tidy up WCL. if (niter != 1) { // Free the device free_cl_buffers(); free_kernels(); if (opt_fission == APPLU_FISSION_ON) { free(subDevices); free(subCommands); } // Cleanup OpenCL. wclCleanup(); } } subdomain.c0000644000175600017620000000333411553632710011374 0ustar sjpsjp// C port of NPB3.2 // subroutine subdomain #include "applu.h" #include "mpinpb.h" #include /** * Set up the sub-domain sizes. */ void subdomain() { /** * Local variables. */ int mm, ierror, errorcode; // x dimension mm = nx0 % xdim; if (row <= mm) { nx = nx0 / xdim + 1; ipt = (row - 1) * nx; } else { nx = nx0 / xdim; ipt = (row - 1) * nx + mm; } // y dimension mm = ny0 % ydim; if (col <= mm) { ny = ny0 / ydim + 1; jpt = (col - 1) * ny; } else { ny = ny0 / ydim; jpt = (col - 1) * ny + mm; } // z dimension nz = nz0; // Check the sub-domain size. 
if (nx < 4 || ny < 4 || nz < 4 ) { printf(" Subdomain size is too small - adjust problem size or number of processors so that "); printf("nx, ny and nz are greater than or equal to 4. They are currently %d, %d, %d.\n", nx, ny, nz); MPI_Abort(MPI_COMM_WORLD, 1); } if (nx > isiz1 || ny > isiz2 || nz > isiz3) { printf(" Subdomain size is too large - adjust problem size or number of processors so that "); printf("nx, ny and nz are less than or equal to isiz1, isiz2 and isiz3 respectively. They are "); printf("currently %d, %d, %d.\n", nx, ny, nz); MPI_Abort(MPI_COMM_WORLD, 1); } if (id == 0) { printf(" Subdomain: %d x %d x %d.\n", nx, ny, nz); } // Set up the start and end in i and j extents for all processors. // Originals: /*ist = 1; iend = nx; if (north == -1) { ist = 2; } if (south == -1) { iend = nx - 1; } jst = 1; jend = ny; if (west == -1) { jst = 2; } if (east == -1) { jend = ny - 1; }*/ // New versions. ist = 2; iend = nx + 1; if (north == -1) { ist = 3; } if (south == -1) { iend = nx; } jst = 2; jend = ny + 1; if (west == -1) { jst = 3; } if (east == -1) { jend = ny; } } timers.c0000644000175600017620000000765211553633122010723 0ustar sjpsjp#include "timers.h" #include "wcl.h" #include #include #include #include #include // Array of timers. Timer* timers; int no_timers; extern cl_command_queue subQueue; /** * Return the current cpu and wall time. */ void timer(double* cpu, double* et) { struct rusage r; struct timeval t; getrusage( RUSAGE_SELF, &r ); *cpu = r.ru_utime.tv_sec + r.ru_utime.tv_usec*1.0e-6; gettimeofday( &t, (struct timezone *)0 ); *et = t.tv_sec + t.tv_usec*1.0e-6; } // If we are timing, then the functions have content. #ifdef TIMING /** * Create the timers. */ void timer_init(int number) { // Allocate an array of timers. timers = (Timer*) malloc(number * sizeof(Timer)); no_timers = number; // Initialise them all to 0. int i; for (i = 0; i < no_timers; i++) { timer_reset(i); } } /** * Reset all timers. 
*/ void timer_reset() { int i; for (i = 0; i < no_timers; i++) { timers[i].cpu_start = 0; timers[i].cpu_end = 0; timers[i].wall_start = 0; timers[i].wall_end = 0; timers[i].cpu_total = 0; timers[i].wall_total = 0; timers[i].calls = 0; } } /** * Start a timer. */ void timer_start(int i) { // Make sure the queue is synchronised. clFinish(subQueue); timer(&timers[i].cpu_start, &timers[i].wall_start); } /** * Stop a timer. */ void timer_stop(int i) { // Make sure the queue is synchronised. clFinish(subQueue); // Update the timer. timer(&timers[i].cpu_end, &timers[i].wall_end); timers[i].cpu_total = timers[i].cpu_total + (timers[i].cpu_end - timers[i].cpu_start); timers[i].wall_total = timers[i].wall_total + (timers[i].wall_end - timers[i].wall_start); timers[i].calls = timers[i].calls + 1; } /** * Pause a timer. */ void timer_pause(int i) { // Make sure the queue is synchronised. clFinish(subQueue); timer(&timers[i].cpu_end, &timers[i].wall_end); timers[i].cpu_total = timers[i].cpu_total + (timers[i].cpu_end - timers[i].cpu_start); timers[i].wall_total = timers[i].wall_total + (timers[i].wall_end - timers[i].wall_start); } /** * Restart a timer. */ void timer_restart(int i) { // Make sure the queue is synchronised. clFinish(subQueue); timer(&timers[i].cpu_start, &timers[i].wall_start); } /** * Clean up. */ void timer_finalize() { free(timers); } /** * Return the CPU timer total. */ double timer_cpu_total(int i) { return timers[i].cpu_total; } /** * Return the CPU timer average. */ double timer_cpu_avg(int i) { if (timers[i].calls == 0) { return 0; } return timers[i].cpu_total / timers[i].calls; } /** * Return the Wall timer total. */ double timer_wall_total(int i) { return timers[i].wall_total; } /** * Return the Wall timer average. */ double timer_wall_avg(int i) { if (timers[i].calls == 0) { return 0; } return timers[i].wall_total / timers[i].calls; } /** * Print the timer format information. 
*/ void timer_print_format() { printf(" NAME :\tCPU TOTAL\tWALL TOTAL\tCPU AVG\tWALL AVG\tCALLS\n"); } /** * Print the timer information in a sensible way. */ void timer_print(const char* name, int i) { // Prints as 'Name: CPU_TOTAL WALL_TOTAL CPU_AVG WALL_AVG CALLS printf(" %s:\t%f\t%f\t%f\t%f\t%d\n", name, timer_cpu_total(i), timer_wall_total(i), timer_cpu_avg(i), timer_wall_avg(i), timers[i].calls); } // If we are NOT timing, they are empty functions. #else void timer_init(int number) {} void timer_reset() {} void timer_start(int i) {} void timer_stop(int i) {} void timer_pause(int i) {} void timer_restart(int i) {} void timer_finalize() {} double timer_cpu_total(int i) { return 0.0f; } double timer_cpu_avg(int i) { return 0.0f; } double timer_wall_total(int i) { return 0.0f; } double timer_wall_avg(int i) { return 0.0f; } void timer_print_format() {} void timer_print(const char* name, int i) {} #endif util.c0000644000175600017620000001435411561531775010404 0ustar sjpsjp#include "applu.h" #include "util.h" #include "timers.h" #include #include #include #include /** * Prints a help message. */ static void print_help(void) { fprintf(stderr, "OpenCL implementation of the NAS-LU benchmark.\n"); fprintf(stderr, "Usage: lu [OPTIONS]...\n"); fprintf(stderr, "\t -h, --help\t Print a summary of the options.\n"); fprintf(stderr, "\t -p, --platform=PLATFORM\t Set the OpenCL platform. Acceptable options for PLATFORM are: amd, intel, nvidia.\n"); fprintf(stderr, "\t -d, --device=DEVICE\t Set the device type. Acceptable options for DEVICE are: cpu, gpu.\n"); fprintf(stderr, "\t -w, --work-distribution=DISTRIBUTION\t Set the work-item distribution. Acceptable options for DISTRIBUTION are: fine, coarse.\n"); fprintf(stderr, "\t -k, --kernels=KERNELS\t Set the kernel type. Acceptable options for KERNELS are: scalar, vector\n"); fprintf(stderr, "\t -l, --layout=LAYOUT\t Set the memory layout for kernels. 
Acceptable options for LAYOUT are: aos (array of structs) or soa (struct of arrays)\n"); fprintf(stderr, "\t -b, --blocking=BLOCKING\t Set the k-blocking policy for wavefront section. Acceptable options for BLOCKING are: old or new\n"); fprintf(stderr, "\t -f, --fission=ON/OFF\t Enable/disable device fission. Enabling fission splits multi-socket CPUs into separate devices.\n"); } // Command line options. static struct option long_opts[] = { { "help", 0, NULL, 'h' }, { "platform", 1, NULL, 'p' }, { "device", 1, NULL, 'd' }, { "work-distribution", 1, NULL, 'w' }, { "kernels", 1, NULL, 'k' }, { "layout", 1, NULL, 'l' }, { "blocking", 1, NULL, 'b' }, { "fission", 1, NULL, 'f' }, }; #define GETOPTS "h:p:d:k:l:b:f" /** * Parse the command line options. */ void parse_options(int argc, char* argv[]) { // Parse command line options. int optc; // Default options: amd, cpu, scalar char* platform = "amd"; char* device = "cpu"; char* distribution = "fine"; char* kernels = "scalar"; char* layout = "aos"; char* blocking = "old"; char* fission = "off"; while ((optc = getopt_long(argc, argv, GETOPTS, long_opts, NULL)) != -1) { switch (optc) { case 'h': print_help(); exit(EXIT_SUCCESS); break; case 'p': platform = optarg; break; case 'd': device = optarg; break; case 'w': distribution = optarg; break; case 'k': kernels = optarg; break; case 'l': layout = optarg; break; case 'b': blocking = optarg; break; case 'f': fission = optarg; break; default: print_help(); exit(EXIT_FAILURE); break; } } // Check for platform. if (strcmp(platform, "amd") == 0) { opt_platform = APPLU_PLATFORM_AMD; } else if (strcmp(platform, "nvidia") == 0) { opt_platform = APPLU_PLATFORM_NVIDIA; } else if (strcmp(platform, "intel") == 0) { opt_platform = APPLU_PLATFORM_INTEL; } else { fprintf(stderr, "Unrecognised platform: %s.\n", platform); exit(EXIT_FAILURE); } // Check for device type. 
if (strcmp(device, "cpu") == 0) { opt_device = APPLU_DEVICE_CPU; } else if (strcmp(device, "gpu") == 0) { opt_device = APPLU_DEVICE_GPU; } else { fprintf(stderr, "Unrecognised device type: %s.\n", device); exit(EXIT_FAILURE); } // Check for work-item distribution. if (strcmp(distribution, "fine") == 0) { opt_distribution = APPLU_DISTRIBUTION_FINE; } else if (strcmp(distribution, "coarse") == 0) { opt_distribution = APPLU_DISTRIBUTION_COARSE; } else { fprintf(stderr, "Unrecognised work-item distribution: %s.\n", distribution); exit(EXIT_FAILURE); } // Check for kernel type. if (strcmp(kernels, "scalar") == 0) { opt_kernels = APPLU_KERNELS_SCALAR; } else if (strcmp(kernels, "vector") == 0) { opt_kernels = APPLU_KERNELS_VECTOR; } else { fprintf(stderr, "Unrecognised kernel type: %s.\n", kernels); exit(EXIT_FAILURE); } // Check for layout type. if (strcmp(layout, "aos") == 0) { opt_layout = APPLU_LAYOUT_AOS; if (opt_kernels == APPLU_KERNELS_VECTOR) { fprintf(stderr, " Error: Cannot use vector kernels with AoS layout.\n"); exit(EXIT_FAILURE); } } else if (strcmp(layout, "soa") == 0) { opt_layout = APPLU_LAYOUT_SOA; } else { fprintf(stderr, "Unrecognised layout type: %s.\n", layout); exit(EXIT_FAILURE); } // Check for blocking type. if (strcmp(blocking, "old") == 0) { opt_blocking = APPLU_BLOCKING_OLD; } else if (strcmp(blocking, "new") == 0) { opt_blocking = APPLU_BLOCKING_NEW; } else { fprintf(stderr, "Unrecognised blocking type: %s.\n", blocking); exit(EXIT_FAILURE); } // Check for device fission option. if (strcmp(fission, "off") == 0) { opt_fission = APPLU_FISSION_OFF; } else if (strcmp(fission, "on") == 0) { opt_fission = APPLU_FISSION_ON; } else { fprintf(stderr, "Unrecognised fission type: %s.\n", fission); exit(EXIT_FAILURE); } } /** * Allocate buffers. */ void allocate_buffers() { // Allocate memory for solution arrays. 
u = calloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5, sizeof(double) ); rsd = calloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5, sizeof(double) ); frct = calloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5, sizeof(double) ); flux = calloc( isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5, sizeof(double) ); if (!u || !rsd || !frct || !flux) { fprintf(stderr, "Could not allocate memory for solution arrays.\n"); exit(EXIT_FAILURE); } // Allocate memory for communication buffers. buf = calloc( 10 * isiz3 * isiz2, sizeof(double) ); buf1 = calloc( 10 * isiz3 * isiz2, sizeof(double) ); if (!buf || !buf1) { fprintf(stderr, "Could not allocate memory for communication buffers.\n"); exit(EXIT_FAILURE); } } /** * Free memory for all arrays. */ void free_buffers() { // Free solution arrays. free(u); free(rsd); free(frct); free(flux); // Free communication buffers. free(buf); free(buf1); } /** * Print all of the timers. */ void print_timers() { printf(" TIMING RESULTS:\n"); timer_print_format(); timer_print("SSOR ", 0); timer_print("PRE ", 1); timer_print("JACLD ", 2); timer_print("BLTS ", 3); timer_print("JACU ", 4); timer_print("BUTS ", 5); timer_print("POST ", 6); timer_print("L2NORM", 7); timer_print("RHS ", 8); timer_print("REARR ", 9); timer_print("EX1 ", 10); timer_print("EX3 ", 11); } util_cl.c0000644000175600017620000005015211570737742011060 0ustar sjpsjp#include "applu.h" #include "applu_cl.h" #include #include #include #include #include "util.h" /** * Dirty hack for swapping cl_mem pointers. */ inline void swap_pointers(cl_mem* x, cl_mem* y) { cl_mem tmp = *x; *x = *y; *y = tmp; } /** * Build a single OpenCL kernel. * TODO: Generalise this such that it takes a list of include files. 
*/ cl_kernel build_kernel(char* kernel_directory, char* filename, char* kernel_name) { cl_int status; char* full_filename = malloc( (strlen(kernel_directory) + strlen(filename) + 1) * sizeof(char) ); strcpy(full_filename, kernel_directory); strcat(full_filename, filename); if (id == 0) { printf(" Building %s - %s.\n", full_filename, kernel_name); } // Array of source strings. char* source[4]; // The first source string is a header file constructed here: double tmp = 1.0e+00 / ( omega * ( 2.0e+00 - omega ) ); source[ 0] = (char*) malloc(4096 * sizeof(char)); if (!source[0]) { fprintf(stderr, "Failed to malloc source[0]\n"); exit(EXIT_FAILURE); } sprintf(source[ 0], "#define nnodes_compiled %d\n#define isiz01 %d\n#define isiz02 %d\n#define isiz03 %d\n#define isiz1 %d\n#define isiz2 %d\n#define isiz3 %d\n#define itmax_default %d\n#define inorm_default %d\n#define dt_default %f\n#define kblock %d\n#define problem_height %d\n#define ist %d\n#define iend %d\n#define jst %d\n#define jend %d\n#define nx %d\n#define ny %d\n#define nz %d\n#define tx1 %f\n#define tx2 %f\n#define tx3 %f\n#define ty1 %f\n#define ty2 %f\n#define ty3 %f\n#define tz1 %f\n#define tz2 %f\n#define tz3 %f\n#define dx1 %f\n#define dx2 %f\n#define dx3 %f\n#define dx4 %f\n#define dx5 %f\n#define dy1 %f\n#define dy2 %f\n#define dy3 %f\n#define dy4 %f\n#define dy5 %f\n#define dz1 %f\n#define dz2 %f\n#define dz3 %f\n#define dz4 %f\n#define dz5 %f\n#define omega %f\n#define dt %f\n#define dssp %f\n#define north %d\n#define south %d\n#define east %d\n#define west %d\n#define rhsblock_x %d\n#define rhsblock_y %d\n#define rhsgrid_x %d\n#define rhsgrid_y %d\n#define waveblock_x %d\n#define waveblock_y %d\n#define max_buffer_size %ld\n", nnodes_compiled, isiz01, isiz02, isiz03, isiz1, isiz2, isiz3, itmax_default, inorm_default, dt_default, kblock, problem_height, ist, iend, jst, jend, nx, ny, nz, tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, dy5, dz1, dz2, dz3, dz4, 
dz5, omega, dt, dssp, north, south, east, west, rhsblock[0], rhsblock[1], (int) ceil((isiz1 + 4) / (double) rhsblock[0]), (int) ceil((isiz1 + 4) / (double) rhsblock[1]), waveblock[0], waveblock[1], max_buffer_size); // Determine new or old policy. if (opt_blocking == APPLU_BLOCKING_OLD) { strcat(source[ 0], "#define APPLU_BLOCKING_OLD 1\n"); } else if (opt_blocking == APPLU_BLOCKING_NEW) { strcat(source[ 0], "#define APPLU_BLOCKING_NEW 1\n"); } // Load a platform-specific header. if (opt_platform == APPLU_PLATFORM_AMD) { source[ 1] = wclLoadProgramSource("./kernels/amd.clh"); } else if (opt_platform == APPLU_PLATFORM_NVIDIA) { source[ 1] = wclLoadProgramSource("./kernels/nvidia.clh"); } else if (opt_platform == APPLU_PLATFORM_INTEL) { source[ 1] = wclLoadProgramSource("./kernels/intel.clh"); } // Load a layout-specific header. if (opt_layout == APPLU_LAYOUT_AOS) { source[ 2] = wclLoadProgramSource("./kernels/aos.clh"); } else if (opt_layout == APPLU_LAYOUT_SOA) { source[ 2] = wclLoadProgramSource("./kernels/soa.clh"); } source[ 3] = wclLoadProgramSource(full_filename); // Build the program from source. cl_program program = clCreateProgramWithSource(subContext, 4, (const char**) &source, NULL, &status); wclCheckError(status, CL_SUCCESS, " Error: Could not create program from source: "); // Compile the program. if (opt_platform == APPLU_PLATFORM_AMD) { status = clBuildProgram(program, 0, NULL, "-Werror", NULL, NULL); } else if (opt_platform == APPLU_PLATFORM_NVIDIA) { status = clBuildProgram(program, 0, NULL, "-Werror -cl-nv-opt-level=3 -cl-nv-verbose -cl-mad-enable", NULL, NULL); } else if (opt_platform == APPLU_PLATFORM_INTEL) { status = clBuildProgram(program, 0, NULL, "-Werror", NULL, NULL); } // Wait until the build has finished. 
cl_build_status build_status = CL_BUILD_IN_PROGRESS; while (build_status == CL_BUILD_IN_PROGRESS) { build_status = clGetProgramBuildInfo(program, subDevice, CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status, NULL); if (build_status == CL_BUILD_SUCCESS) { //printf(" Build Status: SUCCESS\n"); } else if (build_status == CL_BUILD_ERROR) { //printf(" Build Status: ERROR\n"); } else if (build_status == CL_BUILD_NONE) { //printf(" Build Status: NONE\n"); } else if (build_status == CL_BUILD_IN_PROGRESS) { //printf(" Build Status: IN PROGRESS\n"); } } // Get the build log. size_t len = 0; char* buffer = (char*) malloc( 65536 * sizeof(char) ); //printf(" Build Log:\n"); status = clGetProgramBuildInfo(program, subDevice, CL_PROGRAM_BUILD_LOG, 65536 * sizeof(char), buffer, &len); wclCheckError(status, CL_SUCCESS, " Error: Failed to retrieve build log.\n"); char* buffer2 = (char*) malloc( len * sizeof(char) ); strncpy(buffer2, buffer, len); printf("%s\n", buffer2); free(buffer2); free(buffer); if (build_status != CL_BUILD_SUCCESS) { printf(" Error: Failed to build program executable: "); exit(EXIT_FAILURE); } // If we're running on a GPU, grab the PTX and dump it. /*if (opt_device == APPLU_DEVICE_GPU) { printf("SOURCE:\n"); printf(source[0]); printf(source[1]); printf(source[2]); printf(source[3]); //printf(source[4]); printf("\n"); size_t binary_size = 0; clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); char* binary = (char*) malloc( binary_size * sizeof(char) ); clGetProgramInfo(program, CL_PROGRAM_BINARIES, binary_size * sizeof(char), &binary, NULL); int i = 0; printf("BINARY:\n"); for (i = 0; i < binary_size; i++) { printf("%c", binary[i]); } printf("\n"); fflush(stdout); free(binary); }*/ // Create the kernels from this program. 
cl_kernel kernel = clCreateKernel(program, kernel_name, &status); wclCheckError(status, CL_SUCCESS, " Error: Failed to create kernel.\n"); clReleaseProgram(program); free(source[0]); free(source[1]); free(source[2]); free(source[3]); return kernel; } /** * Builds all of the OpenCL kernels. */ void build_kernels() { cl_uint status; char* kernel_directory; if (opt_kernels == APPLU_KERNELS_SCALAR) { kernel_directory = "./kernels/scalar/"; } else if (opt_kernels == APPLU_KERNELS_VECTOR) { kernel_directory = "./kernels/vector/"; } blts_kernel = build_kernel(kernel_directory, "blts.cl", "blts_kernel"); buts_kernel = build_kernel(kernel_directory, "buts.cl", "buts_kernel"); l2norm_kernel = build_kernel(kernel_directory, "l2norm.cl", "l2norm_kernel"); pre_kernel = build_kernel(kernel_directory, "pre.cl", "pre_kernel"); post_kernel = build_kernel(kernel_directory, "post.cl", "post_kernel"); rhs_setup_kernel = build_kernel(kernel_directory, "rhs/rhs_setup.cl", "rhs_setup_kernel"); rhs_xi_kernel = build_kernel(kernel_directory, "rhs/rhs_xi.cl", "rhs_xi_kernel"); rhs_eta_kernel = build_kernel(kernel_directory, "rhs/rhs_eta.cl", "rhs_eta_kernel"); rhs_zeta_kernel = build_kernel(kernel_directory, "rhs/rhs_zeta.cl", "rhs_zeta_kernel"); rhs_xi1_kernel = build_kernel(kernel_directory, "rhs/xi/rhs_xi1.cl", "rhs_xi1_kernel"); rhs_xi2_kernel = build_kernel(kernel_directory, "rhs/xi/rhs_xi2.cl", "rhs_xi2_kernel"); rhs_xi3_kernel = build_kernel(kernel_directory, "rhs/xi/rhs_xi3.cl", "rhs_xi3_kernel"); rhs_xi4_kernel = build_kernel(kernel_directory, "rhs/xi/rhs_xi4.cl", "rhs_xi4_kernel"); rhs_xi_dissipation_kernel = build_kernel(kernel_directory, "rhs/xi/rhs_xi_dissipation.cl", "rhs_xi_dissipation_kernel"); rhs_eta1_kernel = build_kernel(kernel_directory, "rhs/eta/rhs_eta1.cl", "rhs_eta1_kernel"); rhs_eta2_kernel = build_kernel(kernel_directory, "rhs/eta/rhs_eta2.cl", "rhs_eta2_kernel"); rhs_eta3_kernel = build_kernel(kernel_directory, "rhs/eta/rhs_eta3.cl", "rhs_eta3_kernel"); 
rhs_eta4_kernel = build_kernel(kernel_directory, "rhs/eta/rhs_eta4.cl", "rhs_eta4_kernel"); rhs_eta_dissipation_kernel = build_kernel(kernel_directory, "rhs/eta/rhs_eta_dissipation.cl", "rhs_eta_dissipation_kernel"); rhs_zeta1_kernel = build_kernel(kernel_directory, "rhs/zeta/rhs_zeta1.cl", "rhs_zeta1_kernel"); rhs_zeta2_kernel = build_kernel(kernel_directory, "rhs/zeta/rhs_zeta2.cl", "rhs_zeta2_kernel"); rhs_zeta3_kernel = build_kernel(kernel_directory, "rhs/zeta/rhs_zeta3.cl", "rhs_zeta3_kernel"); rhs_zeta4_kernel = build_kernel(kernel_directory, "rhs/zeta/rhs_zeta4.cl", "rhs_zeta4_kernel"); rhs_zeta_dissipation_kernel = build_kernel(kernel_directory, "rhs/zeta/rhs_zeta_dissipation.cl", "rhs_zeta_dissipation_kernel"); flat_to_hyperplane_kernel = build_kernel(kernel_directory, "rearrangement.cl", "flat_to_hyperplane_kernel"); hyperplane_to_flat_kernel = build_kernel(kernel_directory, "rearrangement.cl", "hyperplane_to_flat_kernel"); flat_to_tiled_kernel = build_kernel(kernel_directory, "rearrangement.cl", "flat_to_tiled_kernel"); tiled_to_flat_kernel = build_kernel(kernel_directory, "rearrangement.cl", "tiled_to_flat_kernel"); tiled_to_hyperplane_kernel = build_kernel(kernel_directory, "rearrangement.cl", "tiled_to_hyperplane_kernel"); hyperplane_to_tiled_kernel = build_kernel(kernel_directory, "rearrangement.cl", "hyperplane_to_tiled_kernel"); ex1_unpack_north_kernel = build_kernel(kernel_directory, "ex1_unpack.cl", "ex1_unpack_north_kernel"); ex1_unpack_west_kernel = build_kernel(kernel_directory, "ex1_unpack.cl", "ex1_unpack_west_kernel"); ex1_unpack_south_kernel = build_kernel(kernel_directory, "ex1_unpack.cl", "ex1_unpack_south_kernel"); ex1_unpack_east_kernel = build_kernel(kernel_directory, "ex1_unpack.cl", "ex1_unpack_east_kernel"); ex1_pack_north_kernel = build_kernel(kernel_directory, "ex1_pack.cl", "ex1_pack_north_kernel"); ex1_pack_west_kernel = build_kernel(kernel_directory, "ex1_pack.cl", "ex1_pack_west_kernel"); ex1_pack_south_kernel = 
build_kernel(kernel_directory, "ex1_pack.cl", "ex1_pack_south_kernel"); ex1_pack_east_kernel = build_kernel(kernel_directory, "ex1_pack.cl", "ex1_pack_east_kernel"); ex3_unpack_north_kernel = build_kernel(kernel_directory, "ex3_unpack.cl", "ex3_unpack_north_kernel"); ex3_unpack_west_kernel = build_kernel(kernel_directory, "ex3_unpack.cl", "ex3_unpack_west_kernel"); ex3_unpack_south_kernel = build_kernel(kernel_directory, "ex3_unpack.cl", "ex3_unpack_south_kernel"); ex3_unpack_east_kernel = build_kernel(kernel_directory, "ex3_unpack.cl", "ex3_unpack_east_kernel"); ex3_pack_north_kernel = build_kernel(kernel_directory, "ex3_pack.cl", "ex3_pack_north_kernel"); ex3_pack_west_kernel = build_kernel(kernel_directory, "ex3_pack.cl", "ex3_pack_west_kernel"); ex3_pack_south_kernel = build_kernel(kernel_directory, "ex3_pack.cl", "ex3_pack_south_kernel"); ex3_pack_east_kernel = build_kernel(kernel_directory, "ex3_pack.cl", "ex3_pack_east_kernel"); memset_double_kernel = build_kernel(kernel_directory, "rearrangement.cl", "memset_double_kernel"); if (opt_device == APPLU_DEVICE_CPU) { print_mem_kernel = build_kernel(kernel_directory, "print.cl", "print_mem_kernel"); } } /** * Frees all of the OpenCL kernels. 
*/ void free_kernels() { clReleaseKernel(blts_kernel); clReleaseKernel(buts_kernel); clReleaseKernel(l2norm_kernel); clReleaseKernel(pre_kernel); clReleaseKernel(post_kernel); clReleaseKernel(rhs_setup_kernel); clReleaseKernel(rhs_xi_kernel); clReleaseKernel(rhs_eta_kernel); clReleaseKernel(rhs_zeta_kernel); clReleaseKernel(rhs_xi1_kernel); clReleaseKernel(rhs_xi2_kernel); clReleaseKernel(rhs_xi3_kernel); clReleaseKernel(rhs_xi4_kernel); clReleaseKernel(rhs_xi_dissipation_kernel); clReleaseKernel(rhs_eta1_kernel); clReleaseKernel(rhs_eta2_kernel); clReleaseKernel(rhs_eta3_kernel); clReleaseKernel(rhs_eta4_kernel); clReleaseKernel(rhs_eta_dissipation_kernel); clReleaseKernel(rhs_zeta1_kernel); clReleaseKernel(rhs_zeta2_kernel); clReleaseKernel(rhs_zeta3_kernel); clReleaseKernel(rhs_zeta4_kernel); clReleaseKernel(rhs_zeta_dissipation_kernel); clReleaseKernel(flat_to_hyperplane_kernel); clReleaseKernel(hyperplane_to_flat_kernel); clReleaseKernel(flat_to_tiled_kernel); clReleaseKernel(tiled_to_flat_kernel); clReleaseKernel(tiled_to_hyperplane_kernel); clReleaseKernel(hyperplane_to_tiled_kernel); clReleaseKernel(ex1_unpack_north_kernel); clReleaseKernel(ex1_unpack_west_kernel); clReleaseKernel(ex1_unpack_south_kernel); clReleaseKernel(ex1_unpack_east_kernel); clReleaseKernel(ex1_pack_north_kernel); clReleaseKernel(ex1_pack_west_kernel); clReleaseKernel(ex1_pack_south_kernel); clReleaseKernel(ex1_pack_east_kernel); clReleaseKernel(ex3_unpack_north_kernel); clReleaseKernel(ex3_unpack_west_kernel); clReleaseKernel(ex3_unpack_south_kernel); clReleaseKernel(ex3_unpack_east_kernel); clReleaseKernel(ex3_pack_north_kernel); clReleaseKernel(ex3_pack_west_kernel); clReleaseKernel(ex3_pack_south_kernel); clReleaseKernel(ex3_pack_east_kernel); clReleaseKernel(memset_double_kernel); if (opt_device == APPLU_DEVICE_CPU) { clReleaseKernel(print_mem_kernel); } } /** * Creates all of the OpenCL buffers. 
*/ void allocate_cl_buffers() { size_t flat_size = isiz3 * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double); size_t hyper_size = problem_height * (isiz2 + 4) * (isiz1 + 4) * 5 * sizeof(double); size_t tiled_size = problem_height * (rhsblock[0] * rhsgrid[0]) * (rhsblock[1] * rhsgrid[1]) * 5 * sizeof(double); if (flat_size > hyper_size) { max_buffer_size = flat_size; } else { max_buffer_size = hyper_size; } if (tiled_size > max_buffer_size) { max_buffer_size = tiled_size; } if (id == 0) { printf(" sizeof(max_buffer_size) = %lu, Max Buffer Size = %ld\n", sizeof(max_buffer_size), max_buffer_size); } size_t memory = (5 * sizeof(double)) + 5 * (max_buffer_size) + (((isiz1 + 4) + (isiz2 + 4)) * sizeof(int)) + (((isiz1 + 4) + (isiz2 + 4) + kblock - 1) * sizeof(int)) + (((isiz1 + 4) * (isiz2 + 4)) * sizeof(int)) + 2 * (kblock * (isiz2 + 4) * (isiz1 + 4) * sizeof(int)); size_t gigabytes = memory / 1024 / 1024 / 1024; size_t megabytes = (memory - (gigabytes * (1024 * 1024 * 1024))) / 1024 / 1024; size_t kilobytes = (memory - (gigabytes * (1024 * 1024 * 1024)) - (megabytes * (1024 * 1024))) / 1024; size_t bytes = (memory - (gigabytes * (1024 * 1024 * 1024)) - (megabytes * (1024 * 1024)) - (kilobytes * 1024)); if (id == 0) { printf(" Required Device Memory = %ld GB, %ld MB, %ld KB, %ld B\n", gigabytes, megabytes, kilobytes, bytes); } cl_int status; sum_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, 5 * sizeof(double), NULL, &status); u_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, max_buffer_size, NULL, &status); clEnqueueWriteBuffer(subQueue, u_d, CL_TRUE, 0, flat_size, u, 0, NULL, NULL); rsd_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, max_buffer_size, NULL, &status); clEnqueueWriteBuffer(subQueue, rsd_d, CL_TRUE, 0, flat_size, rsd, 0, NULL, NULL); flux_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, max_buffer_size, NULL, &status); clEnqueueWriteBuffer(subQueue, flux_d, CL_TRUE, 0, flat_size, flux, 0, NULL, NULL); frct_d = clCreateBuffer(subContext, 
CL_MEM_READ_WRITE, max_buffer_size, NULL, &status); clEnqueueWriteBuffer(subQueue, frct_d, CL_TRUE, 0, flat_size, frct, 0, NULL, NULL); int lookup_height = 0; if (opt_blocking == APPLU_BLOCKING_OLD) { lookup_height = kblock; } else if (opt_blocking == APPLU_BLOCKING_NEW) { lookup_height = isiz3; } wavefront_offsets_2d_d = clCreateBuffer(subContext, CL_MEM_READ_ONLY, ((isiz1 + 4) + (isiz2 + 4)) * sizeof(int), NULL, &status); wavefront_offsets_3d_d = clCreateBuffer(subContext, CL_MEM_READ_ONLY, ((isiz1 + 4) + (isiz2 + 4) + lookup_height) * sizeof(int), NULL, &status); columns_d = clCreateBuffer(subContext, CL_MEM_READ_ONLY, (isiz1 + 4) * (isiz2 + 4) * sizeof(int), NULL, &status); rows_d = clCreateBuffer(subContext, CL_MEM_READ_ONLY, (isiz1 + 4) * (isiz2 + 4) * sizeof(int), NULL, &status); thread_mapping_d = clCreateBuffer(subContext, CL_MEM_READ_ONLY, (isiz1 + 4) * (isiz2 + 4) * sizeof(int), NULL, &status); rearrangement_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, max_buffer_size, NULL, &status); ibuffer_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, kblock * 5 * (iend - ist + 1) * sizeof(double), NULL, &status); jbuffer_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, kblock * 5 * (jend - jst + 1) * sizeof(double), NULL, &status); buf_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, 10 * isiz3 * isiz2 * sizeof(double), NULL, &status); buf1_d = clCreateBuffer(subContext, CL_MEM_READ_WRITE, 10 * isiz3 * isiz2 * sizeof(double), NULL, &status); wclCheckError(status, CL_SUCCESS, "Error: Failed to create device buffers.\n"); } /** * Frees all of the OpenCL buffers. 
*/ void free_cl_buffers() { clReleaseMemObject(sum_d); clReleaseMemObject(u_d); clReleaseMemObject(rsd_d); clReleaseMemObject(flux_d); clReleaseMemObject(frct_d); clReleaseMemObject(wavefront_offsets_2d_d); clReleaseMemObject(wavefront_offsets_3d_d); clReleaseMemObject(columns_d); clReleaseMemObject(rows_d); clReleaseMemObject(thread_mapping_d); clReleaseMemObject(rearrangement_d); clReleaseMemObject(ibuffer_d); clReleaseMemObject(jbuffer_d); clReleaseMemObject(buf_d); clReleaseMemObject(buf1_d); } // Prepare lookup tables. void prepare_lookup_tables(int* wavefront_offsets_2d, int* wavefront_offsets_3d, int* columns, int* rows, int* thread_mapping) { int wavefront = 0; int counter = 0; // Calculate the 2D offsets. for (wavefront = 0; wavefront < ( (isiz1 + 4) + (isiz2 + 4) ); wavefront++) { wavefront_offsets_2d[wavefront] = counter; int i, j; // Note: Has to be for i, for j in order to maintain counting order (from bottom left). for (i = 0; i < (isiz1 + 4); i++) { // No point checking if this is true. if (i > wavefront) { break; } j = wavefront - i; if (j < (isiz2 + 4)) { thread_mapping[(j * (isiz1 + 4)) + i] = counter; columns[counter] = i; rows[counter] = j; counter++; } } } wavefront = 0; counter = 0; int lookup_height = 0; if (opt_blocking == APPLU_BLOCKING_OLD) { lookup_height = kblock; } else if (opt_blocking == APPLU_BLOCKING_NEW) { lookup_height = isiz3; } // Calculate the 3D offsets. for (wavefront = 0; wavefront < ( (isiz1 + 4) + (isiz2 + 4) + lookup_height - 1 ); wavefront++) { wavefront_offsets_3d[wavefront] = counter; int k, j, i; for (k = 0; k < lookup_height; k++) { // No point checking if this is true. if (k > wavefront) { break; } for (j = 0; j < (isiz2 + 4); j++) { // Also no point checking if this is true. if (j + k > wavefront) { break; } i = wavefront - (j + k); if (i < (isiz1 + 4)) { counter++; } } } } // Copy across to the OpenCL buffers. 
clEnqueueWriteBuffer(subQueue, wavefront_offsets_2d_d, CL_TRUE, 0, ((isiz1 + 4) + (isiz2 + 4)) * sizeof(int), wavefront_offsets_2d, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, wavefront_offsets_3d_d, CL_TRUE, 0, ((isiz1 + 4) + (isiz2 + 4) + lookup_height) * sizeof(int), wavefront_offsets_3d, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, columns_d, CL_TRUE, 0, (isiz1 + 4) * (isiz2 + 4) * sizeof(int), columns, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, rows_d, CL_TRUE, 0, (isiz1 + 4) * (isiz2 + 4) * sizeof(int), rows, 0, NULL, NULL); clEnqueueWriteBuffer(subQueue, thread_mapping_d, CL_TRUE, 0, (isiz1 + 4) * (isiz2 + 4) * sizeof(int), thread_mapping, 0, NULL, NULL); } /** * Print the contents of a cl_mem buffer. */ void print_mem(cl_mem memory, int n) { cl_int status; status = clSetKernelArg(print_mem_kernel, 0, sizeof(cl_mem), (void*) &memory); status |= clSetKernelArg(print_mem_kernel, 1, sizeof(int), (void*) &n); size_t global = 1; size_t local = 1; status = clEnqueueNDRangeKernel(subQueue, print_mem_kernel, 1, NULL, &global, &local, 0, NULL, NULL); wclCheckError(status, CL_SUCCESS, " Error: Could not launch print_mem_kernel.\n"); } verify.c0000644000175600017620000002011111553632710010707 0ustar sjpsjp// C port of NPB3.2 // subroutine verify(xcr, xce, xci, class, verified) #include "applu.h" #include "mpinpb.h" #include #include /** * Verification routine. * Returns 1 / 0 depending on success / failure. */ int verify(double xcr[5], double xce[5], double xci, char class) { /** * Local variables. */ double xcrref[5], xceref[5], xciref, xcrdif[5], xcedif[5], xcidif, epsilon, dtref; int m; int verified; // Tolerance level. epsilon = 1.0e-08; class = 'U'; verified = 1; for (m = 0; m < 5; m++) { xcrref[m] = 1.0; xceref[m] = 1.0; } xciref = 1.0; // Check for CLASS = S. 
if (nx0 == 12 && ny0 == 12 && nz0 == 12 && itmax == 50) { class = 'S'; dtref = 5.0e-1; // Reference values of RMS-norms of residual, for the 12x12x12 grid, // after 50 time steps, with DT = 5.0e-1; xcrref[0] = 1.6196343210976702e-02; xcrref[1] = 2.1976745164821318e-03; xcrref[2] = 1.5179927653399185e-03; xcrref[3] = 1.5029584435994323e-03; xcrref[4] = 3.4264073155896461e-02; // Reference values of RMS-norms of solution error, for the 12x12x12 grid, // after 50 time steps, with DT = 5.0e-1. xceref[0] = 6.4223319957960924e-04; xceref[1] = 8.4144342047347926e-05; xceref[2] = 5.8588269616485186e-05; xceref[3] = 5.8474222595157350e-05; xceref[4] = 1.3103347914111294e-03; // Reference values of surface integral, for the 12x12x12 grid, // after 50 time steps, with DT = 5.0e-1. xciref = 7.8418928865937083e+00; } // Check for CLASS = W. else if (nx0 == 33 && ny0 == 33 && nz0 == 33 && itmax == 300) { class = 'W'; dtref = 1.5e-3; // Reference values of RMS-norms of residual, for the 33x33x33 grid, // after 300 time steps, with DT = 1.5d-3. xcrref[0] = 0.1236511638192e+02; xcrref[1] = 0.1317228477799e+01; xcrref[2] = 0.2550120713095e+01; xcrref[3] = 0.2326187750252e+01; xcrref[4] = 0.2826799444189e+02; // Reference values for RMS-norms of solution error, for the 33x33x33 grid, // after 300 time steps, with DT = 1.5d-3. xceref[0] = 0.4867877144216e+00; xceref[1] = 0.5064652880982e-01; xceref[2] = 0.9281818101960e-01; xceref[3] = 0.8570126542733e-01; xceref[4] = 0.1084277417792e+01; // Reference value of surface integral, for the 33x33x33 grid, // after 300 time steps, with DT = 1.5d-3. xciref = 0.1161399311023e+02; } // Check for CLASS = A. else if (nx0 == 64 && ny0 == 64 && nz0 == 64 && itmax == 250) { class = 'A'; dtref = 2.0e+0; // Reference values of RMS-norms of residual, for the 64x64x64 grid, // after 250 time steps, with DT = 2.0e+00. 
xcrref[0] = 7.7902107606689367e+02; xcrref[1] = 6.3402765259692870e+01; xcrref[2] = 1.9499249727292479e+02; xcrref[3] = 1.7845301160418537e+02; xcrref[4] = 1.8384760349464247e+03; // Reference values of RMS-norms of solution error, for the 64x64x64 grid, // after 250 time steps, with DT = 2.0d+00. xceref[0] = 2.9964085685471943e+01; xceref[1] = 2.8194576365003349e+00; xceref[2] = 7.3473412698774742e+00; xceref[3] = 6.7139225687777051e+00; xceref[4] = 7.0715315688392578e+01; // Reference value of surface integral, for the 64x64x64 grid, // after 250 time steps, with DT = 2.0d+00. xciref = 2.6030925604886277e+01; } // Check for CLASS = B. else if (nx0 == 102 && ny0 == 102 && nz0 == 102 && itmax == 250) { class = 'B'; dtref = 2.0e+0; // Reference values of RMS-norms of residual, for the (102X102X102) grid, // after 250 time steps, with DT = 2.0d+00 xcrref[0] = 3.5532672969982736e+03; xcrref[1] = 2.6214750795310692e+02; xcrref[2] = 8.8333721850952190e+02; xcrref[3] = 7.7812774739425265e+02; xcrref[4] = 7.3087969592545314e+03; // Reference values of RMS-norms of solution error, for the (102X102X102) // grid, after 250 time steps, with DT = 2.0d+00 xceref[0] = 1.1401176380212709e+02; xceref[1] = 8.1098963655421574e+00; xceref[2] = 2.8480597317698308e+01; xceref[3] = 2.5905394567832939e+01; xceref[4] = 2.6054907504857413e+02; // Reference value of surface integral, for the (102X102X102) grid, // after 250 time steps, with DT = 2.0d+00 xciref = 4.7887162703308227e+01; } // Check for CLASS = C. 
else if (nx0 == 162 && ny0 == 162 && nz0 == 162 && itmax == 250) { class = 'C'; dtref = 2.0e+0; // Reference values of RMS-norms of residual, for the (162X162X162) grid, // after 250 time steps, with DT = 2.0d+00 xcrref[0] = 1.03766980323537846e+04; xcrref[1] = 8.92212458801008552e+02; xcrref[2] = 2.56238814582660871e+03; xcrref[3] = 2.19194343857831427e+03; xcrref[4] = 1.78078057261061185e+04; // Reference values of RMS-norms of solution error, for the (162X162X162) // grid, after 250 time steps, with DT = 2.0d+00 xceref[0] = 2.15986399716949279e+02; xceref[1] = 1.55789559239863600e+01; xceref[2] = 5.41318863077207766e+01; xceref[3] = 4.82262643154045421e+01; xceref[4] = 4.55902910043250358e+02; // Reference value of surface integral, for the (162X162X162) grid, // after 250 time steps, with DT = 2.0d+00 xciref = 6.66404553572181300e+01; } // Check for CLASS = D. else if (nx0 == 408 && ny0 == 408 && nz0 == 408 && itmax == 300) { class = 'D'; dtref = 1.0e+0; // Reference values of RMS-norms of residual, for the (408X408X408) grid, // after 300 time steps, with DT = 1.0d+00 xcrref[0] = 0.4868417937025e+05; xcrref[1] = 0.4696371050071e+04; xcrref[2] = 0.1218114549776e+05; xcrref[3] = 0.1033801493461e+05; xcrref[4] = 0.7142398413817e+05; // Reference values of RMS-norms of solution error, for the (408X408X408) // grid, after 300 time steps, with DT = 1.0d+00 xceref[0] = 0.3752393004482e+03; xceref[1] = 0.3084128893659e+02; xceref[2] = 0.9434276905469e+02; xceref[3] = 0.8230686681928e+02; xceref[4] = 0.7002620636210e+03; // Reference value of surface integral, for the (408X408X408) grid, // after 300 time steps, with DT = 1.0d+00 xciref = 0.8334101392503e+02; } else { verified = 0; } // Compute the difference of solution values and the known reference values. 
for (m = 0; m < 5; m++) { xcrdif[m] = fabs( (xcr[m] - xcrref[m]) / xcrref[m] ); xcedif[m] = fabs( (xce[m] - xceref[m]) / xceref[m] ); } xcidif = fabs( (xci - xciref) / xciref ); // Output the comparison of computed results to known cases. if (class != 'U') { printf(" Verification being performanced for class %c.\n", class); printf(" Accuracy setting for epsilon = %E.\n", epsilon); if ( fabs(dt - dtref) > epsilon) { verified = 0; class = 'U'; printf(" DT does not match the reference value of %E.\n", dtref); } } else { printf(" Unknown class.\n"); } if (class != 'U') { printf(" Comparison of RMS-norms of residual.\n"); } else { printf(" RMS-norms of residual.\n"); } for (m = 0; m < 5; m++) { if (class == 'U') { printf(" %d\t%E\n", m, xcr[m]); } else if ( xcrdif[m] > epsilon ) { verified = 0; printf(" FAILURE: %d\t%E, %E, %E\n", m, xcr[m], xcrref[m], xcrdif[m]); } else { printf(" %d\t%E, %E, %E\n", m, xcr[m], xcrref[m], xcrdif[m]); } } if (class != 'U') { printf(" Comparison of RMS-norms of solution error.\n"); } else { printf(" RMS-norms of solution error.\n"); } for (m = 0; m < 5; m++) { if (class == 'U') { printf(" %d\t%E\n", m, xce[m]); } else if ( xcedif[m] <= epsilon ) { printf(" %d\t%E, %E, %E\n", m, xce[m], xceref[m], xcedif[m]); } else { verified = 0; printf(" FAILURE: %d\t%E, %E, %E\n", m, xce[m], xceref[m], xcedif[m]); } } if (class != 'U') { printf(" Comparison of surface integral.\n"); } else { printf(" Surface integral.\n"); } if (class == 'U') { printf(" %E\n", xci); } else if ( xcidif <= epsilon ) { printf(" %E, %E, %E\n", xci, xciref, xcidif); } else { verified = 0; printf(" FAILURE: %E, %E, %E\n", xci, xciref, xcidif); } if (class == 'U') { printf(" No reference values provided.\n"); printf(" No verification performed.\n"); } else if (verified) { printf(" Verification SUCCESSFUL.\n"); } else { printf(" Verification FAILED.\n"); } return verified; }