Failed

Console Output

Started by user Greg Bauer
Running as SYSTEM
Building in workspace /var/lib/jenkins/jobs/mpiGraph/workspace
[SSH] script:
PATH="/sw/tools/bin:/sw/tools/bin:$PATH"
NODELIST=""
ACCOUNT="bbmb-hydro"
PARTITION="sandybridge"
NODES="16"


# tar xvf /var/tmp/mpiGraph.tar  # Galen's copy of the benchmark
cd mpiGraph
export PATH=~/mpiGraph:$PATH


# if the NODELIST parameter was filled in, proceed with a targeted test 
if [[ -n "${NODELIST}" ]]
then

   srun --mem=16g --nodes=${NODES} --ntasks-per-node=1 --time=00:10:00 --job-name=mpiGraph \
      --partition=${PARTITION} --nodelist=${NODELIST} \
      --account=${ACCOUNT} ${GPUS} \
      mpiGraph 1048576 1000 10 > mpiGraph.out
   
   if [ "${NODES}" == "16" ]
   then
      bash clip-plot.sh 16 
   fi

   exit
fi

# otherwise, if NODELIST is "", run the default test on the first available nodes

srun --nodes=${NODES}  \
  --mem=16g --ntasks-per-node=1 --time=00:10:00 --job-name=mpiGraph \
  --account=${ACCOUNT} \
  --partition=${PARTITION}  \
 mpiGraph 1048576 1000 10 > mpiGraph.out

 if [ "${NODES}" == "16" ]
 then
      bash clip-plot.sh 16 
 fi
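
Note that the script above runs clip-plot.sh whenever NODES is 16, even when the srun step fails, which is why the plotting step later in this log ends up parsing an empty table. A minimal sketch of gating the plot on srun's exit status and on a non-empty mpiGraph.out, mirroring the default srun invocation above (variable names and the clip-plot.sh call are taken from the script; the clip-plot.sh interface is assumed from how it is invoked here, and this is a suggestion, not what ran in this build):

# Sketch only, not part of the job as run: skip plotting when the benchmark step fails.
if srun --mem=16g --nodes=${NODES} --ntasks-per-node=1 --time=00:10:00 --job-name=mpiGraph \
      --partition=${PARTITION} --account=${ACCOUNT} \
      mpiGraph 1048576 1000 10 > mpiGraph.out
then
   # plot only when mpiGraph actually wrote output and the node count matches the plot script
   if [ -s mpiGraph.out ] && [ "${NODES}" == "16" ]
   then
      bash clip-plot.sh 16
   fi
else
   echo "srun/mpiGraph failed; skipping plots" >&2
   exit 1
fi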

[SSH] executing...
slurmstepd: error: slurm_receive_msgs: [[hydro31]:6818] failed: Zero Bytes were transmitted or received
[previous line repeated 7 times in total]
slurmstepd: error:  mpi/pmix_v4: pmixp_p2p_send: hydro30 [14]: pmixp_utils.c:467: send failed, rc=1001, exceeded the retry limit
slurmstepd: error:  mpi/pmix_v4: _slurm_send: hydro30 [14]: pmixp_server.c:1583: Cannot send message to /var/spool/slurmd/stepd.slurm.pmix.24259.0, size = 458, hostlist:
(null)
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_reset_if_to: hydro16 [1]: pmixp_coll_ring.c:742: 0x151000033d20: collective timeout seq=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_log: hydro16 [1]: pmixp_coll.c:281: Dumping collective state
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:760: 0x151000033d20: COLL_FENCE_RING state seq=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:762: my peerid: 1:hydro16
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:769: neighbor id: next 2:hydro17, prev 0:hydro15
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:779: Context ptr=0x151000033d98, #0, in-use=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:779: Context ptr=0x151000033dd0, #1, in-use=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:779: Context ptr=0x151000033e08, #2, in-use=1
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:790: 	 seq=0 contribs: loc=1/prev=2/fwd=3
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:792: 	 neighbor contribs [16]:
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:825: 		 done contrib: hydro[15,31]
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:827: 		 wait contrib: hydro[17-20,22-30]
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:829: 	 status=PMIXP_COLL_RING_PROGRESS
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro16 [1]: pmixp_coll_ring.c:833: 	 buf (offset/size): 1101/6861
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_reset_if_to: hydro23 [7]: pmixp_coll_ring.c:742: 0x1455ec033b80: collective timeout seq=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_log: hydro23 [7]: pmixp_coll.c:281: Dumping collective state
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:760: 0x1455ec033b80: COLL_FENCE_RING state seq=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:762: my peerid: 7:hydro23
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:769: neighbor id: next 8:hydro24, prev 6:hydro22
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:779: Context ptr=0x1455ec033bf8, #0, in-use=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:779: Context ptr=0x1455ec033c30, #1, in-use=0
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:779: Context ptr=0x1455ec033c68, #2, in-use=1
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:790: 	 seq=0 contribs: loc=1/prev=8/fwd=9
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:792: 	 neighbor contribs [16]:
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:825: 		 done contrib: hydro[15-20,22,31]
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:827: 		 wait contrib: hydro[24-30]
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:829: 	 status=PMIXP_COLL_RING_PROGRESS
slurmstepd: error:  mpi/pmix_v4: pmixp_coll_ring_log: hydro23 [7]: pmixp_coll_ring.c:833: 	 buf (offset/size): 3261/9021
[hydro16:505984] pml_ucx.c:178  Error: Failed to receive UCX worker address: Not found (-13)
[hydro16:505984] pml_ucx.c:472  Error: Failed to resolve UCX endpoint for rank 0
[hydro23:491271] pml_ucx.c:178  Error: Failed to receive UCX worker address: Not found (-13)
[hydro23:491271] pml_ucx.c:472  Error: Failed to resolve UCX endpoint for rank 6
[hydro16:505984] *** An error occurred in MPI_Gather
[hydro16:505984] *** reported by process [4222097330,1]
[hydro16:505984] *** on communicator MPI_COMM_WORLD
[hydro16:505984] *** MPI_ERR_OTHER: known error not in list
[hydro16:505984] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[hydro16:505984] ***    and potentially your MPI job)
[hydro23:491271] *** An error occurred in MPI_Gather
[hydro23:491271] *** reported by process [4222097330,7]
[hydro23:491271] *** on communicator MPI_COMM_WORLD
[hydro23:491271] *** MPI_ERR_OTHER: known error not in list
[hydro23:491271] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[hydro23:491271] ***    and potentially your MPI job)
slurmstepd: error: *** STEP 24259.0 ON hydro15 CANCELLED AT 2023-05-11T09:29:32 ***
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: error: hydro16: task 1: Exited with exit code 16
srun: error: hydro23: task 7: Exited with exit code 16
srun: error: hydro30: task 14: Killed
srun: error: hydro17: task 2: Killed
srun: error: hydro18: task 3: Killed
srun: error: hydro29: task 13: Killed
srun: error: hydro26: task 10: Killed
srun: error: hydro19: task 4: Killed
srun: error: hydro31: task 15: Killed
srun: error: hydro15: task 0: Killed
srun: error: hydro22: task 6: Killed
srun: error: hydro20: task 5: Killed
srun: error: hydro27: task 11: Killed
srun: error: hydro24: task 8: Killed
srun: error: hydro28: task 12: Killed
srun: error: hydro25: task 9: Killed
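
The slurm_receive_msgs failures and PMIx ring timeouts above appear to involve hydro30 and hydro31 before any MPI traffic starts. A possible follow-up using standard Slurm commands (not part of this job's script; node and partition names are taken from the output above):

# Inspect the state and any admin-set Reason on the nodes named in the PMIx errors
scontrol show node hydro30 | egrep "State=|Reason="
scontrol show node hydro31 | egrep "State=|Reason="

# List drained/down/failing nodes in the partition this job used, with reasons
sinfo --partition=sandybridge -R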
NODECOUNT=$1
+ NODECOUNT=16

# clip out the Send table and plot it
egrep --after-context=$NODECOUNT "^Send\s+hy" mpiGraph.out \
   | sed -e s/.to//g > send.txt 
+ egrep --after-context=16 '^Send\s+hy' mpiGraph.out
+ sed -e s/.to//g
python3 send.py
+ python3 send.py
Traceback (most recent call last):
  File "/u/svchydrojenkins/mpiGraph/send.py", line 9, in <module>
    df = pd.read_csv("send.txt", sep = "\t")
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 680, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 575, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 933, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1235, in _make_engine
    return mapping[engine](f, **self.options)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 75, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx", line 551, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
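
pandas raises EmptyDataError because the egrep above found no "^Send" table in mpiGraph.out (the srun step was cancelled before mpiGraph printed its results), so send.txt is empty. A minimal sketch of guarding the plotting call inside clip-plot.sh, assuming it keeps the same egrep/sed pipeline and file names shown in this trace:

# Sketch only: invoke the plotting script only when the clipped table is non-empty
egrep --after-context=$NODECOUNT "^Send\s+hy" mpiGraph.out | sed -e 's/.to//g' > send.txt
if [ -s send.txt ]
then
   python3 send.py
else
   echo "send.txt is empty (no Send table in mpiGraph.out); skipping send plot" >&2
fi

The same guard would apply to the recv.txt / recv.py step traced below.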

# clip out the Recv table and plot it
egrep --after-context=$NODECOUNT "^Recv\s+hy" mpiGraph.out \
   | sed -e s/.from//g > recv.txt 
+ egrep --after-context=16 '^Recv\s+hy' mpiGraph.out
+ sed -e s/.from//g
python3 recv.py
+ python3 recv.py
Traceback (most recent call last):
  File "/u/svchydrojenkins/mpiGraph/recv.py", line 9, in <module>
    df = pd.read_csv("recv.txt", sep = "\t")
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 680, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 575, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 933, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1235, in _make_engine
    return mapping[engine](f, **self.options)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 75, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx", line 551, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file

[SSH] completed
[SSH] exit-status: 1

Build step 'Execute shell script on remote host using ssh' marked build as failure
Finished: FAILURE