Adastra MI250X setup (LLVM)¶
Compiler setup¶
Compiling the compiler¶
To set up LLVM on Adastra:
# Build the Intel LLVM (DPC++) compiler with the HIP backend.
# Everything (sources, build tree, install prefix) lives under $SCRATCHDIR so
# that the install lands in $SCRATCHDIR/intel_llvm, the location the later
# steps of this guide expect.
module purge
module load PrgEnv-amd
module load cray-python
module load CCE-GPU-2.1.0
module load rocm/5.7.1 # 5.5.1 -> 5.7.1
# to get cmake and ninja
pip3 install -U cmake ninja
export PATH=$HOMEDIR/.local/bin:$PATH
# do everything in scratch
# The directory to use for the build
cd "$SCRATCHDIR" || exit 1
# get a shallow clone
git clone --depth 1 https://github.com/intel/llvm.git intel-llvm-git
cd "$SCRATCHDIR/intel-llvm-git" || exit 1
# configure llvm
python3 buildbot/configure.py --hip --cmake-opt="-DCMAKE_INSTALL_PREFIX=$SCRATCHDIR/intel_llvm" --cmake-opt="-DSYCL_BUILD_PI_HIP_ROCM_DIR=$ROCM_PATH" --cmake-gen "Ninja"
cd "$SCRATCHDIR/intel-llvm-git/build" || exit 1
ninja all
ninja all lib/all tools/libdevice/libsycldevice
ninja install
Testing the compiler¶
Just write this in an example file:
test.cpp | |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# Put the pip-installed tools and the freshly built compiler on PATH,
# then compile the SYCL test program for the MI250X (gfx90a).
export PATH=$HOMEDIR/.local/bin:$PATH
export LLVM_HOME=$SCRATCHDIR/intel_llvm/
echo "Intel LLVM dir :" $LLVM_HOME
export PATH=$LLVM_HOME/bin:$PATH
# Use the ROCm install advertised by the loaded rocm module when available;
# fall back to the default 5.7.1 location otherwise.
clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a --rocm-path=${ROCM_PATH:-/opt/rocm-5.7.1} test.cpp
Allocate some time on the cluster to check that everything works. It should print the device name and 999, 8 times.
# Request an interactive allocation: account cad14954 (-A), one node (-N 1),
# nodes matching the "MI250" constraint (-C), a 100-minute time limit,
# and exclusive use of the node.
salloc -A cad14954 -N 1 -C "MI250" --job-name=interactive --time=100 --exclusive
# Run the compiled test as 8 tasks per node, 8 CPUs per task, one thread per
# core, with each task bound to its closest GPU.
srun --ntasks-per-node=8 --cpus-per-task=8 --threads-per-core=1 --gpu-bind=closest -- ./a.out
Compiling Shamrock¶
Load the modules to compile Shamrock on adastra :
# Start from a clean module environment so nothing left over from a previous
# session interferes.
module purge
# Module set used to build Shamrock. The order is kept as given.
# NOTE(review): presumably cpe/23.12 pins the Cray programming environment
# release and the craype-* modules select the gfx90a GPU / Trento CPU targets
# - confirm against the site documentation.
module load cpe/23.12
module load craype-accel-amd-gfx90a craype-x86-trento
module load PrgEnv-intel
module load cray-mpich/8.1.26
module load cray-python
module load amd-mixed/5.7.1
module load rocm/5.7.1
Before running anything, check that you have run the following commands. If not, the paths to the compiler and Python tools we installed earlier will not be available.
# Put the pip-installed tools and the Intel LLVM build on PATH.
export PATH=$HOMEDIR/.local/bin:$PATH
export LLVM_HOME=$SCRATCHDIR/intel_llvm/
echo "Intel LLVM dir :" $LLVM_HOME
export PATH=$LLVM_HOME/bin:$PATH
cd Shamrock
# Configure Shamrock with the Intel LLVM SYCL compiler targeting gfx90a.
# The C compiler is the unversioned "clang" from the install: the compiler was
# built from a shallow clone of the development branch, so a version-suffixed
# name such as clang-19 may not match what was actually installed.
# The MPI include/library flags are passed by hand through CMAKE_CXX_FLAGS.
cmake -S . -B build -G "Ninja" \
    -DSYCL_IMPLEMENTATION=IntelLLVM \
    -DCMAKE_CXX_COMPILER=$SCRATCHDIR/intel_llvm/bin/clang++ \
    -DSHAMROCK_ENABLE_BACKEND=SYCL \
    -DINTEL_LLVM_PATH=$SCRATCHDIR/intel_llvm \
    -DCMAKE_C_COMPILER=$SCRATCHDIR/intel_llvm/bin/clang \
    -DCMAKE_CXX_FLAGS="-fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx90a --rocm-path=${ROCM_PATH} -I${MPICH_DIR}/include -L${MPICH_DIR}/lib -lmpi ${PE_MPICH_GTL_DIR_amd_gfx90a} ${PE_MPICH_GTL_LIBS_amd_gfx90a}" \
    -DBUILD_TEST=true \
    -DCXX_FLAG_ARCH_NATIVE=off
Slurm scripts :¶
Slurm script example:
slurm_script | |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
#!/bin/bash
# Batch script running Shamrock on one exclusive MI250 node
# (8 MPI tasks, one per GCD).
#SBATCH --account=cad14954
#SBATCH --job-name=ShamrockScalling
#SBATCH --constraint=MI250
#SBATCH --nodes=1
#SBATCH --exclusive
#SBATCH --output=%A.out
#SBATCH --time=0:01:00

echo "The job ${SLURM_JOB_ID} is running on these nodes:"
echo "${SLURM_NODELIST}"

cd $HOMEDIR/Shamrock/build || exit 1

# Same module environment as the one used to build Shamrock, so the runtime
# libraries match the build.
module purge
module load cpe/23.12
module load craype-accel-amd-gfx90a craype-x86-trento
module load PrgEnv-intel
module load cray-mpich/8.1.26
module load cray-python
module load amd-mixed/5.7.1
module load rocm/5.7.1

# The compiler was installed under $SCRATCHDIR/intel_llvm.
export LLVM_HOME=$SCRATCHDIR/intel_llvm
echo "Intel LLVM dir :" $LLVM_HOME
export PATH=$LLVM_HOME/bin:$PATH

# Print the shared libraries the binary resolves to, to help debug the environment.
ldd ./shamrock

# NOTE(review): the original command ended with a dangling line continuation;
# append any further shamrock arguments (e.g. the script to run) here.
srun --ntasks-per-node=8 --cpus-per-task=8 --threads-per-core=1 --gpu-bind=closest -- \
    ./shamrock --sycl-cfg auto:HIP --loglevel 125 --sycl-ls-map
# NOTE(review): in the source this definition was preceded by the stray words
# "swap mystique"; their meaning is unclear - confirm before relying on this
# variant.
#
# Sets the per-rank affinity tables for 8 tasks per node, each with 8 cores and
# one GCD, and selects the GPU-based NIC policy for MPICH.
# Globals written: AFFINITY_NUMACTL, AFFINITY_GPU (arrays indexed by node-local
#                  rank), MPICH_OFI_NIC_POLICY (exported).
function Adastra_MI250_8TasksWith8ThreadsAnd1GPU() {
    # Node local rank 0 gets the GCD 0, is bound the cores [48-55] of NUMA domain 3 and uses the NIC 0
    # Node local rank 1 gets the GCD 1, is bound the cores [56-63] of NUMA domain 3 and uses the NIC 0
    # Node local rank 2 gets the GCD 2, is bound the cores [16-23] of NUMA domain 1 and uses the NIC 1
    # Node local rank 3 gets the GCD 3, is bound the cores [24-31] of NUMA domain 1 and uses the NIC 1
    # Node local rank 4 gets the GCD 4, is bound the cores [ 0- 7] of NUMA domain 0 and uses the NIC 2
    # Node local rank 5 gets the GCD 5, is bound the cores [ 8-15] of NUMA domain 0 and uses the NIC 2
    # Node local rank 6 gets the GCD 6, is bound the cores [32-39] of NUMA domain 2 and uses the NIC 3
    # Node local rank 7 gets the GCD 7, is bound the cores [40-47] of NUMA domain 2 and uses the NIC 3
    AFFINITY_NUMACTL=('48-55' '56-63' '16-23' '24-31' '0-7' '8-15' '32-39' '40-47')
    #AFFINITY_NUMACTL=('0-7' '8-15' '16-23' '24-31' '32-39' '40-47' '48-55' '56-63')
    AFFINITY_GPU=('0' '1' '2' '3' '4' '5' '6' '7')
    export MPICH_OFI_NIC_POLICY=GPU
}
# Sets the per-rank affinity tables for 8 tasks per node, each with 8 cores and
# one GCD, and selects the NUMA-based NIC policy for MPICH.
# Globals written: AFFINITY_NUMACTL, AFFINITY_GPU (arrays indexed by node-local
#                  rank), MPICH_OFI_NIC_POLICY (exported).
# A function with the same name is also defined elsewhere in this file with a
# different NIC policy; if both are sourced, the later definition wins.
function Adastra_MI250_8TasksWith8ThreadsAnd1GPU() {
    # Node local rank 0 gets the GCD 0, is bound the cores [48-55] of NUMA domain 3 and uses the NIC 0
    # Node local rank 1 gets the GCD 1, is bound the cores [56-63] of NUMA domain 3 and uses the NIC 0
    # Node local rank 2 gets the GCD 2, is bound the cores [16-23] of NUMA domain 1 and uses the NIC 1
    # Node local rank 3 gets the GCD 3, is bound the cores [24-31] of NUMA domain 1 and uses the NIC 1
    # Node local rank 4 gets the GCD 4, is bound the cores [ 0- 7] of NUMA domain 0 and uses the NIC 2
    # Node local rank 5 gets the GCD 5, is bound the cores [ 8-15] of NUMA domain 0 and uses the NIC 2
    # Node local rank 6 gets the GCD 6, is bound the cores [32-39] of NUMA domain 2 and uses the NIC 3
    # Node local rank 7 gets the GCD 7, is bound the cores [40-47] of NUMA domain 2 and uses the NIC 3
    # Entry i is the core range for node-local rank i, matching the table above
    # (rank 0 -> 48-55, rank 7 -> 40-47).
    AFFINITY_NUMACTL=('48-55' '56-63' '16-23' '24-31' '0-7' '8-15' '32-39' '40-47')
    #AFFINITY_NUMACTL=('0-7' '8-15' '16-23' '24-31' '32-39' '40-47' '48-55' '56-63')
    AFFINITY_GPU=('0' '1' '2' '3' '4' '5' '6' '7')
    export MPICH_OFI_NIC_POLICY=NUMA
}
