Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f68a2c6
Update submodules
arporter Oct 21, 2025
588d403
Update PSyclone flags in tra_adv makefile
arporter Oct 23, 2025
2a8c0d2
Add new compiler-setup script for spack
arporter Oct 23, 2025
defe249
Update omp cpu and gpu scripts
arporter Oct 23, 2025
bf09178
Fix the omp_cpu_levels_trans script
arporter Oct 23, 2025
4c6aeb8
Update acc kernels managed target
arporter Oct 23, 2025
b4d3c69
Fix ACC kernels with explicit mem movement
arporter Oct 23, 2025
e29ff36
Fix acc loops with explicit mem
arporter Oct 23, 2025
1f31848
Update acc-mixed with explicit mem
arporter Oct 23, 2025
401ece2
Modernise compiler flags and fix acc-mixed-umem target
arporter Oct 23, 2025
9191008
Fix acc-loops-um
arporter Oct 23, 2025
6c5eb13
Rm path to profiling lib from spack-setup script
arporter Oct 23, 2025
87d307a
Fix Makefile for tra-adv compute_in_subroutine
arporter Oct 23, 2025
d0d6c06
Fix other versions of tra_adv benchmark
arporter Oct 23, 2025
9c5526c
Update GHA workflow file
arporter Oct 23, 2025
ba634ea
Update kokkos submodule
arporter Oct 23, 2025
b856e78
Fix NEMOLite2D acc version
arporter Oct 23, 2025
004f7e5
Update all NEMOLite2D transformation scripts
arporter Oct 23, 2025
fa9747a
Rm unused and ancient script from Shallow
arporter Oct 23, 2025
20a6db2
Update kokkos
sergisiso Dec 9, 2025
bc336ee
#101 update PSyclone to master
arporter Dec 15, 2025
cb40396
#101 update problem-size script and compiler flags
arporter Dec 15, 2025
432dc4c
#101 update psyclone scripts to handle u- and v-flather kernels
arporter Dec 15, 2025
badb92d
#101 tidy tra-adv scripts
arporter Dec 15, 2025
0408442
#101 fix nvidia compiler flags
arporter Dec 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/makefile-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ on:
jobs:
build:

runs-on: ubuntu-20.04
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
- run: python -m pip install --upgrade pip
- run: cd shared/PSyclone && pip install .
- name: Install dependencies
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ module boundary_conditions_mod
GO_STENCIL
use kernel_mod, only: kernel_type, GO_POINTWISE, GO_DOFS, &
GO_ALL_PTS, GO_INTERNAL_PTS
use physical_params_mod
use grid_mod
use field_mod
implicit none
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ KOKKOS_PATH ?= $(SHARED_DIR)/kokkos
KOKKOS_DEBUG ?= no # Careful, 10x performance penalty in kernels.
CXXFLAGS = $(CFLAGS) # Use same CFLAGS to compile Kokkos library.

# The Kokkos Makefile is deprecated, but we can still use it with:
KOKKOS_USE_DEPRECATED_MAKEFILES=1

# If KOKKOS_DEVICES is not specified, default to OpenMP
KOKKOS_DEVICES ?= OpenMP

Expand Down Expand Up @@ -90,7 +93,7 @@ clean:
${MAKE} -C ${INF_DIR} clean
rm -f *.o *.mod *.MOD *~ *.dat
rm -f gnu_opt_report.txt *.optrpt
rm -rf KokkosCore_* Makefile.kokkos.f90
rm -rf KokkosCore_* Makefile.kokkos.f90 desul Desul_Config.tmp

allclean: clean
rm -f *.exe fparser.log *.a
Expand Down
37 changes: 23 additions & 14 deletions benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,39 @@

from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.psyGen import TransInfo
from psyclone.psyir.nodes import Loop
from psyclone.psyir.nodes import Container, Loop, Routine
from psyclone.transformations import (
ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans, ACCRoutineTrans,
KernelImportsToArguments)


def trans(psy):
''' Take the supplied psy object, apply OpenACC transformations
to the schedule of invoke_0 and return the new psy object '''
def trans(psyir: Container) -> None:
''' Take the supplied psyir object, apply OpenACC transformations
to the schedule of invoke_0. '''
tinfo = TransInfo()
parallel_trans = tinfo.get_trans_name('ACCParallelTrans')
loop_trans = tinfo.get_trans_name('ACCLoopTrans')
enter_data_trans = tinfo.get_trans_name('ACCEnterDataTrans')
routine_trans = tinfo.get_trans_name('ACCRoutineTrans')
glo2arg_trans = tinfo.get_trans_name('KernelImportsToArguments')
inline_trans = KernelModuleInlineTrans()
enter_data_trans = ACCEnterDataTrans()
routine_trans = ACCRoutineTrans()
glo2arg_trans = KernelImportsToArguments()
mod_inline_trans = KernelModuleInlineTrans()

invoke = psy.invokes.get('invoke_0')
schedule = invoke.schedule
schedule = psyir.walk(Routine)[0]

# Apply the OpenACC Loop transformation to *every* loop
# in the schedule
for child in schedule.children:
if isinstance(child, Loop):
loop_trans.apply(child, {"collapse": 2})
opts = {"collapse": 2}
if child.kernels()[0].name == "bc_flather_v_code":
# We need to ignore dependencies on 'va' because PSyclone
# spots that there is a dependence in the bc_flather_v kernel.
# However, we know that practically this isn't a problem
# because of the way the domain (mask) is configured.
opts["ignore_dependencies_for"] = ["va%data"]
if child.kernels()[0].name == "bc_flather_u_code":
opts["ignore_dependencies_for"] = ["ua%data"]
loop_trans.apply(child, options=opts)

# Put all of the loops in a single parallel region
parallel_trans.apply(schedule)
Expand All @@ -37,6 +48,4 @@ def trans(psy):
for kern in schedule.coded_kernels():
glo2arg_trans.apply(kern)
routine_trans.apply(kern)
inline_trans.apply(kern)

return psy
mod_inline_trans.apply(kern)
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
that PSyclone will generate an OpenCL PSy layer. '''

import os
from psyclone.psyGen import TransInfo
from psyclone.domain.gocean.transformations import \
GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans

from psyclone.domain.gocean.transformations import (
GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans)
from psyclone.configuration import Config
from psyclone.psyir.nodes import Routine
from psyclone.transformations import (
KernelImportsToArguments)


# Global variables to configure the PSyclone OpenCL generation:
Expand All @@ -33,13 +36,12 @@ def trans(psy):
''' Transform the schedule for OpenCL generation '''

# Import transformations
tinfo = TransInfo()
globaltrans = tinfo.get_trans_name('KernelImportsToArguments')
globaltrans = KernelImportsToArguments()
move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()
cltrans = GOOpenCLTrans()

# Get the invoke routine
schedule = psy.invokes.get('invoke_0').schedule
# Get the routine
schedule = psy.walk(Routine)[0]

# Map the kernels by their name to different OpenCL queues. The multiple
# command queues can be executed concurrently while each command queue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@
function via the -s option. It applies OpenMP tasking to every loop
and inlines all kernels in the schedule.'''

from psyclone.psyir.nodes import Loop
from psyclone.psyir.nodes import Container, Loop, Routine
from psyclone.configuration import Config
from psyclone.transformations import OMPParallelTrans, OMPSingleTrans, \
OMPTaskloopTrans, KernelModuleInlineTrans
from psyclone.psyir.transformations import OMPTaskwaitTrans
from psyclone.psyir.nodes import OMPTaskloopDirective, OMPTaskwaitDirective, \
OMPDirective, OMPParallelDirective
from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.transformations import (
OMPParallelTrans, OMPSingleTrans)
from psyclone.psyir.transformations import OMPTaskloopTrans, OMPTaskwaitTrans
from psyclone.psyir.nodes import (OMPTaskloopDirective, OMPTaskwaitDirective,
OMPDirective, OMPParallelDirective)


def trans(psy):
def trans(psyir: Container) -> None:
'''Transformation entry point'''
config = Config.get()

schedule = psy.invokes.get('invoke_0').schedule
schedule = psyir.walk(Routine)[0]

loop_trans = OMPTaskloopTrans(grainsize=32, nogroup=True)
wait_trans = OMPTaskwaitTrans()
Expand All @@ -28,7 +29,16 @@ def trans(psy):

for child in schedule.children:
if isinstance(child, Loop):
loop_trans.apply(child)
# We need to ignore dependencies on '{u,v}a' because PSyclone
# spots that there is a dependence in the bc_flather_{u,v} kernel.
# However, we know that practically this isn't a problem
# because of the way the domain (mask) is configured.
options = {}
if child.kernels()[0].name == "bc_flather_v_code":
options["ignore_dependencies_for"] = ["va%data"]
if child.kernels()[0].name == "bc_flather_u_code":
options["ignore_dependencies_for"] = ["ua%data"]
loop_trans.apply(child, options=options)

single_trans = OMPSingleTrans()
parallel_trans = OMPParallelTrans()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,22 @@
from psyclone.configuration import Config
from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.psyGen import TransInfo
from psyclone.psyir.nodes import Loop
from psyclone.psyir.nodes import Container, Loop, Routine


def trans(psy):
''' Transformation entry point '''
def trans(psyir: Container) -> None:
'''
Transformation entry point.

'''
config = Config.get()
tinfo = TransInfo()
parallel_loop_trans = tinfo.get_trans_name('GOceanOMPParallelLoopTrans')
loop_trans = tinfo.get_trans_name('GOceanOMPLoopTrans')
parallel_trans = tinfo.get_trans_name('OMPParallelTrans')
module_inline_trans = KernelModuleInlineTrans()

schedule = psy.invokes.get('invoke_0').schedule
schedule = psyir.walk(Routine)[0]

# Inline all kernels in this Schedule
for kernel in schedule.kernels():
Expand All @@ -26,15 +29,23 @@ def trans(psy):
# Apply the OpenMPLoop transformation to every child in the schedule or
# OpenMPParallelLoop to every Loop if it has distributed memory.
for child in schedule.children:
# We need to ignore dependencies on '{u,v}a' because PSyclone correctly
# spots that there is a dependence in the bc_flather_{u,v} kernel.
# However, we know that practically this isn't a problem
# because these boundary-condition kernels only update values
# outside the domain.
options = {}
if child.kernels()[0].name == "bc_flather_v_code":
options["ignore_dependencies_for"] = ["va%data"]
if child.kernels()[0].name == "bc_flather_u_code":
options["ignore_dependencies_for"] = ["ua%data"]
if config.distributed_memory:
if isinstance(child, Loop):
parallel_loop_trans.apply(child)
parallel_loop_trans.apply(child, options=options)
else:
loop_trans.apply(child)
loop_trans.apply(child, options=options)

if not config.distributed_memory:
# If it is not distributed memory, enclose all of these loops
# within a single OpenMP PARALLEL region
parallel_trans.apply(schedule.children)

return psy
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
via the -s option. This script module-inlines all kernels in the PSy-layer.'''

from psyclone.domain.common.transformations import KernelModuleInlineTrans
from psyclone.psyir.nodes import Node, Routine


def trans(psy):
''' Transformation script entry function '''
def trans(psy: Node):
'''Entry point for PSyIR transformation. This script module-inlines
every user-supplied kernel that is called.

'''
itrans = KernelModuleInlineTrans()

schedule = psy.invokes.get('invoke_0').schedule
schedule = psy.walk(Routine)[0]

# Module-Inline all coded kernels in this Schedule
for kernel in schedule.coded_kernels():
Expand Down
13 changes: 7 additions & 6 deletions benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ DL_TIMER_NAME = libdl_timer_omp.a

# Shorthand for invoking PSyclone with line-length limiting applied
# to the output Fortran.
PSYCLONE = psyclone -api nemo -l output
PSYCLONE = psyclone -l output

# Serial version.
tra_adv_serial: dl_timer
Expand All @@ -45,7 +45,7 @@ tra_adv_no_auto_serial: dl_timer
# OpenACC version using Unified Memory with timer around outer loop only.
tra_adv_acc: dl_timer
mkdir -p $@
${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \
${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -58,7 +58,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
$(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
endif
mkdir -p $@
${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \
${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -68,7 +68,7 @@ endif
# Serial Fortran version after transformation to SIR-compliant form.
tra_adv_sir: dl_timer
mkdir -p $@
${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \
${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \
./tra_adv_compute_auto_arrays.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -78,7 +78,7 @@ tra_adv_sir: dl_timer
# OpenACC added after transformation to SIR-compliant form.
tra_adv_sir_acc: dl_timer
mkdir -p $@
${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \
${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -90,7 +90,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
$(error The tra_adv_sir_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
endif
mkdir -p $@
${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -opsy \
${PSYCLONE} --profile routines -s ../scripts/sir_kernels_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -110,3 +110,4 @@ allclean: clean
rm -rf tra_adv_acc_prof
rm -rf tra_adv_sir
rm -rf tra_adv_sir_acc
rm -rf tra_adv_no_auto_serial
10 changes: 5 additions & 5 deletions benchmarks/nemo/tracer_advection/multi_kernel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer
DL_TIMER_NAME = libdl_timer_omp.a

# Shorthand for invoking PSyclone.
PSYCLONE = psyclone -api nemo -l output
PSYCLONE = psyclone -l output

# Serial version.
tra_adv_serial: dl_timer
Expand All @@ -36,7 +36,7 @@ tra_adv_serial: dl_timer
# OpenACC version with timer around outer loop only.
tra_adv_acc: dl_timer
mkdir -p $@
${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \
${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -49,7 +49,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
$(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
endif
mkdir -p $@
${PSYCLONE} --profile invokes -s ../scripts/kernels_trans.py -opsy \
${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -59,7 +59,7 @@ endif
# Serial Fortran version after transformation to SIR-compliant form.
tra_adv_sir: dl_timer
mkdir -p $@
${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \
${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \
./tra_adv_compute.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand All @@ -69,7 +69,7 @@ tra_adv_sir: dl_timer
# OpenACC added after transformation to SIR-compliant form.
tra_adv_sir_acc: dl_timer
mkdir -p $@
${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \
${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \
$@/tra_adv_compute.f90 ./tra_adv_compute.F90
cp Makefile_gen $@/Makefile
cp tra_adv_driver.F90 $@/.
Expand Down
Loading
Loading