Skip to content
Success

Console Output

Started by timer
Running as SYSTEM
Building in workspace /var/lib/jenkins/jobs/pytorch_train/workspace
[SSH] script:
TARGETNODE=""""

module load anaconda3_gpu/4.13.0
module load cuda/11.7.0

cd pytorch_train
rm -f train_results_jenkins.csv

# Slurm Arguments
sargs="--nodes=1 "
sargs+="--ntasks-per-node=1 "
sargs+="--mem=16g "
sargs+="--time=00:10:00 "
sargs+="--account=bbmb-hydro "
sargs+="--gpus-per-node=1 "
sargs+="--gpu-bind=closest "
# Add Target node if it exists
if [[ ! -z ${TARGETNODE} ]]
then
    PARTITION=`sinfo --format="%R,%N" -n hydro61  | grep hydro61  | cut -d',' -f1 | tail -1`
    sargs+="--partition=${PARTITION} "
    sargs+="--nodelist=${TARGETNODE} "
else
    sargs+="--partition=a100 "
fi
# Executable to run
scmd="python train.py | tee time.txt"

# Run the command
start_time=`date +%s.%N`
echo $"Starting srun with command"
echo "srun $sargs $scmd"
srun $sargs $scmd
end_time=`date +%s.%N`

runtime=$( echo "$end_time - $start_time" | bc -l )
echo "YVALUE=$runtime" > time.txt
printf "Pytorch test completed in %0.3f secs\n" $runtime

[SSH] executing...
Starting srun with command
srun --nodes=1 --ntasks-per-node=1 --mem=16g --time=00:10:00 --account=bbmb-hydro --gpus-per-node=1 --gpu-bind=closest --partition=a100  python train.py | tee time.txt
srun: job 98292 queued and waiting for resources
srun: job 98292 has been allocated resources
slurmstepd: error: execve(): python: No such file or directory
srun: error: hydro04: task 0: Exited with exit code 2
Pytorch test completed in 0.606 secs

[SSH] completed
[SSH] exit-status: 0

[workspace] $ /bin/sh -xe /tmp/jenkins351450810407367193.sh
+ scp 'HYDRO_REMOTE:~svchydrojenkins/pytorch_train/time.txt' /var/lib/jenkins/jobs/pytorch_train/workspace
Recording plot data
Saving plot series data from: /var/lib/jenkins/jobs/pytorch_train/workspace/time.txt
Finished: SUCCESS