Console Output

Started by timer
Running as SYSTEM
Building in workspace /var/lib/jenkins/jobs/cuda_bw/workspace
[SSH] script:
# Prepend the site tool directory to PATH for the rest of this script.
PATH="/sw/tools/bin:/sw/tools/bin:$PATH"
# Presumably the job's NODELIST parameter as substituted into the script
# (TODO confirm); two adjacent empty quoted strings evaluate to the empty
# string, which selects the default (non-targeted) path further down.
NODELIST=""""

# Create and enter the scratch build directory.  Abort if either step fails:
# the wildcard cleanup below must never run in some other directory.
workdir=~svchydrojenkins/hydro/cuda
mkdir -p "$workdir" || { echo "cannot create $workdir" >&2; exit 1; }
cd "$workdir" || { echo "cannot cd to $workdir" >&2; exit 1; }
pwd
# Remove files left over from a previous run.  rm is not recursive here, so
# subdirectories are not removed; "--" guards against names starting with "-".
rm -f -- *

#export PATH=$PATH:/usr/local/cuda/bin
# new hydro gpu sw stack testing
#module use /hysw/spack/hydrogpu-2022-06/modules/lmod/Core
# Record the loaded modules and the nvcc in use so the log shows the toolchain.
module list
command -v nvcc

export CUDASAMPLES=~svchydrojenkins/hydro_sw_testing/gpu/cuda/cuda-samples

# Compile one CUDA sample (<name>.cu in the current directory) into an
# executable called <name>, and stop the job if no executable was produced.
# Arguments: $1 - sample name without the .cu suffix
build_sample() {
   local name=$1
   nvcc -I "$CUDASAMPLES/Common" -o "$name" "$name.cu"
   if [[ -x "$name" ]]
   then
      echo "$name appears to have built ok"
   else
      echo "#### $name is missing or not built ###############"
      # Exit statuses are limited to 0-255, so use a plain 1 rather than -1.
      exit 1
   fi
}

# Copy the two sample sources next to where the binaries are built.
cp "$CUDASAMPLES/Samples/0_Introduction/matrixMul/matrixMul.cu"  ./
cp "$CUDASAMPLES/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu"  ./
build_sample matrixMul
# bandwidthTest is run by every later step, so verify it the same way.
build_sample bandwidthTest
pwd
ls -lt

# If the NODELIST parameter was filled in, proceed with a targeted test on that
# node (1 task, 1 GPU per node), write the plot inputs to *.txt and exit.
if [[ -n "${NODELIST}" ]]
then
   rm -f stream_*.txt
   echo "YVALUE=1" > stream_triad.txt
   echo "YVALUE=1" > stream_gpu56c_triad.txt
   echo "YVALUE=1" > stream_gpu64c_triad.txt
   # for a single node string, it's easy enough to calculate a matching partition:
   # sinfo prints "partition,nodes" lines; keep the partition of the last line
   # that mentions the node.
   PARTITION=$(sinfo --format="%R,%N" -n "${NODELIST}" | grep "${NODELIST}" | cut -d',' -f1 | tail -1)
   echo "PARTITION=${PARTITION}"

   # Options shared by every srun invocation in this branch.
   srun_args=(--partition="${PARTITION}" --time=00:10:00 --ntasks=1 --gpus-per-node=1 --nodelist="${NODELIST}")

   srun "${srun_args[@]}"  nsys profile --gpu-metrics-device=all  ./bandwidthTest --htod
   srun "${srun_args[@]}"  ./matrixMul
   # DCGM profiling is paused around the ncu run and resumed afterwards.
   dcgmi profile --pause
   srun "${srun_args[@]}"  ncu --metrics "regex:.*"  --target-processes all ./matrixMul
   dcgmi profile --resume

   # Take the text between "Performance=" and "GFlop/s" from the matrixMul output.
   GFLOPS=$(srun "${srun_args[@]}"  ./matrixMul \
      | grep Performance= | cut -d"=" -f2 | cut -d"G" -f1)
   echo "GFLOPS=$GFLOPS"
   echo "YVALUE=$GFLOPS" > gflops.txt

   # The bandwidthTest result row is "<transfer size> <GB/s>"; take the second
   # column directly so the parse does not depend on the row's leading blanks.
   BW=$(srun "${srun_args[@]}"  ./bandwidthTest --htod \
      | awk '/32000000/ {print $2; exit}')
   echo "BW=$BW"
   echo "YVALUE=$BW" > bw.txt
   # Same file name as the default path uses (was misspelled "mvcc_version.txt").
   nvcc --version > nvcc_version.txt
   exit
fi

# otherwise, if NODELIST is "" : run the same tests on any node of a fixed partition.
PARTITION=a100
# Options shared by every srun invocation below.
srun_args=(--partition="${PARTITION}" --time=00:10:00 --ntasks=1 --gpus-per-node=1)

# First run doubles as a scheduler smoke test: stop the job if srun itself fails.
if srun "${srun_args[@]}"  ./bandwidthTest --htod
then
   echo "slurm and srun ran successfully"
else
   echo "srun failed"
   # Exit statuses are limited to 0-255, so use a plain 1 rather than -1.
   exit 1
fi

# Run matrixMul once with its full output in the log, then again to capture the figure.
srun "${srun_args[@]}"  ./matrixMul
# Take the text between "Performance=" and "GFlop/s" from the matrixMul output.
GFLOPS=$(srun "${srun_args[@]}" ./matrixMul \
   | grep Performance= | cut -d"=" -f2 | cut -d"G" -f1)
echo "GFLOPS=$GFLOPS"
echo "YVALUE=$GFLOPS" > gflops.txt

# The bandwidthTest result row is "<transfer size> <GB/s>"; take the second
# column directly so the parse does not depend on the row's leading blanks.
BW=$(srun "${srun_args[@]}"  ./bandwidthTest --htod \
   | awk '/32000000/ {print $2; exit}')
echo "BW=$BW"
echo "YVALUE=$BW" > bw.txt

# Keep the compiler version alongside the results.
nvcc --version > nvcc_version.txt
exit




[SSH] executing...
/u/svchydrojenkins/hydro/cuda

Currently Loaded Modules:
  1) gcc/11.3.0      3) cuda/11.7.0   5) scripts/script_paths   7) StdEnv
  2) openmpi/4.1.4   4) modtree/gpu   6) user/license_file

 

/sw/spack/hydrogpu-2022-06/apps/cuda/11.7.0-gcc-11.3.0-3ysno6b/bin/nvcc
matrixMul appears to have built ok
/u/svchydrojenkins/hydro/cuda
total 1658
-rwx------ 1 svchydrojenkins grp_202 825104 Nov 16  2024 bandwidthTest
-rwx------ 1 svchydrojenkins grp_202 826712 Nov 16 14:29 matrixMul
-rwx------ 1 svchydrojenkins grp_202  32962 Nov 16 14:29 bandwidthTest.cu
-rwx------ 1 svchydrojenkins grp_202  11783 Nov 16 14:29 matrixMul.cu
srun: job 97720 queued and waiting for resources
srun: job 97720 has been allocated resources
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: NVIDIA A100 80GB PCIe
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)	Bandwidth(GB/s)
   32000000			24.7

Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
slurm and srun ran successfully
srun: job 97721 queued and waiting for resources
srun: job 97721 has been allocated resources
[Matrix Multiply Using CUDA] - Starting...
GPU Device 0: "Ampere" with compute capability 8.0

MatrixA(320,320), MatrixB(640,320)
Computing result using CUDA Kernel...
done
Performance= 4340.45 GFlop/s, Time= 0.030 msec, Size= 131072000 Ops, WorkgroupSize= 1024 threads/block
Checking computed result for correctness: Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
srun: job 97722 queued and waiting for resources
srun: job 97722 has been allocated resources
GFLOPS= 4356.21
srun: job 97723 queued and waiting for resources
srun: job 97723 has been allocated resources
BW=24.6

[SSH] completed
[SSH] exit-status: 0

[workspace] $ /bin/sh -xe /tmp/jenkins9893228767696482167.sh
+ scp 'HYDRO_REMOTE:~svchydrojenkins/hydro/cuda/*.txt' /var/lib/jenkins/jobs/cuda_bw/workspace
Recording plot data
Saving plot series data from: /var/lib/jenkins/jobs/cuda_bw/workspace/gflops.txt
Saving plot series data from: /var/lib/jenkins/jobs/cuda_bw/workspace/bw.txt
Finished: SUCCESS