Skip to content
Failed

Console Output

Started by timer
Running as SYSTEM
Building in workspace /var/lib/jenkins/jobs/cuda_bw/workspace
[SSH] script:
PATH="/sw/tools/bin:/sw/tools/bin:$PATH"
NODELIST=""""

# Per-job scratch directory; each run starts by emptying it.
mkdir -p ~svchydrojenkins/hydro/cuda
# Abort if the directory cannot be entered: the wildcard delete below would
# otherwise remove files from whatever directory the session started in.
cd ~svchydrojenkins/hydro/cuda || { echo "#### cannot cd to ~svchydrojenkins/hydro/cuda ####"; exit 1; }
pwd
# `--` keeps file names that begin with '-' from being parsed as rm options.
rm -f -- *

#export PATH=$PATH:/usr/local/cuda/bin
# new hydro gpu sw stack testing
#module use /hysw/spack/hydrogpu-2022-06/modules/lmod/Core
# Record the loaded modules and the nvcc location in the job log.
module list
which nvcc

# Copy the two CUDA sample sources into the current directory and build them.
export CUDASAMPLES=~svchydrojenkins/hydro_sw_testing/gpu/cuda/cuda-samples
cp "$CUDASAMPLES/Samples/0_Introduction/matrixMul/matrixMul.cu"  ./
cp "$CUDASAMPLES/Samples/1_Utilities/bandwidthTest/bandwidthTest.cu"  ./

nvcc -I "$CUDASAMPLES/Common" -o matrixMul matrixMul.cu
if [[ -x "matrixMul" ]]
then
   echo "matrixMul appears to have built ok"
else
   echo "#### matrixMul is missing or not built ###############"
   exit -1
fi

nvcc -I "$CUDASAMPLES/Common" -o bandwidthTest bandwidthTest.cu
# Same check as for matrixMul: without it a failed compile only shows up
# later as a failing srun of ./bandwidthTest.
if [[ -x "bandwidthTest" ]]
then
   echo "bandwidthTest appears to have built ok"
else
   echo "#### bandwidthTest is missing or not built ###############"
   exit -1
fi
pwd
ls -lt

# If the NODELIST parameter was filled in, proceed with a targeted test on
# that node (1 task, 1 GPU), write the plot inputs and exit.
if [[ -n "${NODELIST}" ]]
then
   rm -f stream_*.txt
   echo "YVALUE=1" > stream_triad.txt
   echo "YVALUE=1" > stream_gpu56c_triad.txt
   echo "YVALUE=1" > stream_gpu64c_triad.txt
   # for a single node string, it's easy enough to calculate a matching partition:
   PARTITION=$(sinfo --format="%R,%N" -n "${NODELIST}" | grep -- "${NODELIST}" | cut -d',' -f1 | tail -1)
   echo "PARTITION=${PARTITION}"

   # srun invocation shared by every step of the targeted test.
   srun_node=(srun --partition="${PARTITION}" --time=00:10:00 --ntasks=1 --gpus-per-node=1 --nodelist="${NODELIST}")

   "${srun_node[@]}" nsys profile --gpu-metrics-device=all ./bandwidthTest --htod
   "${srun_node[@]}" ./matrixMul
   # DCGM profiling is paused around the ncu run and resumed afterwards.
   dcgmi profile --pause
   "${srun_node[@]}" ncu --metrics "regex:.*" --target-processes all ./matrixMul
   dcgmi profile --resume

   # Take the text between "Performance=" and the first "G" of the matrixMul output.
   GFLOPS=$("${srun_node[@]}" ./matrixMul | \
      grep Performance= | cut -d"=" -f2 | cut -d"G" -f1)
   echo GFLOPS=$GFLOPS
   echo "YVALUE=$GFLOPS" > gflops.txt

   # Keep the bandwidthTest result line for the 32000000-byte transfer size.
   BW=$("${srun_node[@]}" \
      ./bandwidthTest --htod | grep 32000000)
   # $BW is deliberately unquoted: word splitting collapses the whitespace so
   # that the third space-separated field is the bandwidth figure.
   BW=$(echo BW=$BW | cut -d' ' -f3)
   echo BW=$BW
   echo "YVALUE=$BW" > bw.txt
   # Same file name as the default path uses (was misspelled mvcc_version.txt).
   nvcc --version > nvcc_version.txt
   exit
fi

# Default path, taken when NODELIST is "": run on the a100 partition without
# pinning a node.
PARTITION=a100

# srun invocation shared by every step below.
srun_any=(srun --partition=${PARTITION} --time=00:10:00 --ntasks=1 --gpus-per-node=1)

# The first run doubles as a check that the job can be launched at all.
if "${srun_any[@]}" ./bandwidthTest --htod
then
   echo "slurm and srun ran successfully"
else
   echo "srun failed"
   exit -1
fi

"${srun_any[@]}" ./matrixMul

# Take the text between "Performance=" and the first "G" of the matrixMul output.
GFLOPS=$("${srun_any[@]}" ./matrixMul \
   | grep Performance= | cut -d"=" -f2 | cut -d"G" -f1)
echo GFLOPS=$GFLOPS
echo "YVALUE=$GFLOPS" > gflops.txt

# Keep the bandwidthTest result line for the 32000000-byte transfer size, then
# take its third space-separated field after the (deliberately unquoted)
# expansion has collapsed the whitespace.
BW=$("${srun_any[@]}" ./bandwidthTest --htod | grep 32000000)
BW=$(echo BW=$BW | cut -d' ' -f3)
echo BW=$BW
echo "YVALUE=$BW" > bw.txt

nvcc --version > nvcc_version.txt
exit




[SSH] executing...
[SSH] Exception:Auth fail for methods 'publickey'
com.jcraft.jsch.JSchException: Auth fail for methods 'publickey'
	at PluginClassLoader for jsch//com.jcraft.jsch.Session.connect(Session.java:521)
	at PluginClassLoader for ssh//org.jvnet.hudson.plugins.CredentialsSSHSite.createSession(CredentialsSSHSite.java:132)
	at PluginClassLoader for ssh//org.jvnet.hudson.plugins.CredentialsSSHSite.executeCommand(CredentialsSSHSite.java:208)
	at PluginClassLoader for ssh//org.jvnet.hudson.plugins.SSHBuilder.perform(SSHBuilder.java:104)
	at hudson.tasks.BuildStepMonitor$1.perform(BuildStepMonitor.java:20)
	at hudson.model.AbstractBuild$AbstractBuildExecution.perform(AbstractBuild.java:818)
	at hudson.model.Build$BuildExecution.build(Build.java:199)
	at hudson.model.Build$BuildExecution.doRun(Build.java:164)
	at hudson.model.AbstractBuild$AbstractBuildExecution.run(AbstractBuild.java:526)
	at hudson.model.Run.execute(Run.java:1894)
	at hudson.model.FreeStyleBuild.run(FreeStyleBuild.java:44)
	at hudson.model.ResourceController.execute(ResourceController.java:101)
	at hudson.model.Executor.run(Executor.java:446)
Build step 'Execute shell script on remote host using ssh' marked build as failure
Recording plot data
Saving plot series data from: /var/lib/jenkins/jobs/cuda_bw/workspace/gflops.txt
Saving plot series data from: /var/lib/jenkins/jobs/cuda_bw/workspace/bw.txt
Sending e-mails to: [email protected]
Finished: FAILURE