Skip to content
Failed

Console Output

Started by user arnoldg
Running as SYSTEM
Building in workspace /var/lib/jenkins/jobs/mpiGraph/workspace
[SSH] script:
# Job parameters (filled in by the build configuration).
PATH="/sw/tools/bin:/sw/tools/bin:$PATH"
# Two adjacent empty strings, i.e. the empty string: no targeted node list.
NODELIST=""""
ACCOUNT="bbmb-hydro"
PARTITION="all"
NODES="16"

# Purpose: run the mpiGraph benchmark through srun on ${NODES} nodes (one task
# per node), capture its output in mpiGraph.out and, for 16-node runs, hand the
# output to clip-plot.sh for plotting.
#   NODELIST  optional; when non-empty the run is pinned to those nodes.
#   GPUS      optional extra srun option(s); only applied to targeted runs.
# Exit status: srun's status if the benchmark fails, otherwise the status of
# the last command executed.

# tar xvf /var/tmp/mpiGraph.tar  # galen's copy of the benchmark
cd mpiGraph || { echo "ERROR: cannot cd to mpiGraph" >&2; exit 1; }
export PATH=~/mpiGraph:$PATH

# Options shared by the targeted and the default run.
srun_args=(
   --nodes="${NODES}"
   --mem=16g
   --ntasks-per-node=1
   --time=00:10:00
   --job-name=mpiGraph
   --account="${ACCOUNT}"
   --partition="${PARTITION}"
)

# if the NODELIST parameter was filled in, proceed with a targeted test;
# otherwise run the default test on the first available nodes
if [[ -n "${NODELIST}" ]]
then
   srun_args+=(--nodelist="${NODELIST}")
   # GPUS is deliberately left unquoted: it may be empty (contributing no
   # argument at all) or hold several whitespace-separated srun options.
   # shellcheck disable=SC2206
   srun_args+=(${GPUS:-})
fi

srun "${srun_args[@]}" mpiGraph 1048576 1000 10 > mpiGraph.out
srun_status=$?

# Stop here when the benchmark itself failed: mpiGraph.out then holds no
# usable tables, and running the plotting step would only bury the real error
# under empty-data tracebacks from the plotting scripts.
if (( srun_status != 0 ))
then
   echo "ERROR: srun/mpiGraph failed with exit status ${srun_status}; skipping plots" >&2
   exit "${srun_status}"
fi

# The plotting helper is only invoked for the 16-node configuration.
if [[ "${NODES}" == "16" ]]
then
   bash clip-plot.sh 16
fi

[SSH] executing...
[hydro61:1024441:0:1024441] rc_verbs_iface.c:124  send completion with error: transport retry counter exceeded [qpn 0x7d64 wrid 0x1vendor_err 0x81]
[hydro61:1024441:0:1024441] rc_verbs_iface.c:124  [rqpn 0x22b dlid=0 sl=0 port=1 src_path_bits=0 dgid=::ffff:172.24.192.107 sgid_index=3 traffic_class=0]
==== backtrace (tid:1024441) ====
 0  /usr/lib64/libucs.so.0(ucs_handle_error+0x2dc) [0x14b53c099edc]
 1  /usr/lib64/libucs.so.0(ucs_fatal_error_message+0xb1) [0x14b53c096d41]
 2  /usr/lib64/libucs.so.0(ucs_log_default_handler+0xde4) [0x14b53c09b6a4]
 3  /usr/lib64/libucs.so.0(ucs_log_dispatch+0xe4) [0x14b53c09b9c4]
 4  /usr/lib64/ucx/libuct_ib.so.0(+0x31d54) [0x14b530f18d54]
 5  /usr/lib64/ucx/libuct_ib.so.0(+0x32217) [0x14b530f19217]
 6  /usr/lib64/libucp.so.0(ucp_worker_progress+0x6a) [0x14b53c76182a]
 7  /sw/spack/hydrogpu-2022-06/apps/openmpi/4.1.4-gcc-11.3.0-i6koipj/lib/libopen-pal.so.40(opal_progress+0x2c) [0x14b53ca1ebec]
 8  /sw/spack/hydrogpu-2022-06/apps/openmpi/4.1.4-gcc-11.3.0-i6koipj/lib/libmpi.so.40(ompi_request_default_test_all+0x143) [0x14b53d694bf3]
 9  /sw/spack/hydrogpu-2022-06/apps/openmpi/4.1.4-gcc-11.3.0-i6koipj/lib/libmpi.so.40(PMPI_Testall+0xab) [0x14b53d6d144b]
10  /u/svchydrojenkins/mpiGraph/mpiGraph() [0x40122a]
11  /u/svchydrojenkins/mpiGraph/mpiGraph() [0x401d78]
12  /usr/lib64/libc.so.6(__libc_start_main+0xf3) [0x14b53d062cf3]
13  /u/svchydrojenkins/mpiGraph/mpiGraph() [0x400d5e]
=================================
[hydro61:1024441] *** Process received signal ***
[hydro61:1024441] Signal: Aborted (6)
[hydro61:1024441] Signal code:  (-6)
[hydro61:1024441] [ 0] /usr/lib64/libpthread.so.0(+0x12ce0)[0x14b53d3ffce0]
[hydro61:1024441] [ 1] /usr/lib64/libc.so.6(gsignal+0x10f)[0x14b53d076a9f]
[hydro61:1024441] [ 2] /usr/lib64/libc.so.6(abort+0x127)[0x14b53d049e05]
[hydro61:1024441] [ 3] /usr/lib64/libucs.so.0(+0x27d46)[0x14b53c096d46]
[hydro61:1024441] [ 4] /usr/lib64/libucs.so.0(ucs_log_default_handler+0xde4)[0x14b53c09b6a4]
[hydro61:1024441] [ 5] /usr/lib64/libucs.so.0(ucs_log_dispatch+0xe4)[0x14b53c09b9c4]
[hydro61:1024441] [ 6] /usr/lib64/ucx/libuct_ib.so.0(+0x31d54)[0x14b530f18d54]
[hydro61:1024441] [ 7] /usr/lib64/ucx/libuct_ib.so.0(+0x32217)[0x14b530f19217]
[hydro61:1024441] [ 8] /usr/lib64/libucp.so.0(ucp_worker_progress+0x6a)[0x14b53c76182a]
[hydro61:1024441] [ 9] /sw/spack/hydrogpu-2022-06/apps/openmpi/4.1.4-gcc-11.3.0-i6koipj/lib/libopen-pal.so.40(opal_progress+0x2c)[0x14b53ca1ebec]
[hydro61:1024441] [10] /sw/spack/hydrogpu-2022-06/apps/openmpi/4.1.4-gcc-11.3.0-i6koipj/lib/libmpi.so.40(ompi_request_default_test_all+0x143)[0x14b53d694bf3]
[hydro61:1024441] [11] /sw/spack/hydrogpu-2022-06/apps/openmpi/4.1.4-gcc-11.3.0-i6koipj/lib/libmpi.so.40(PMPI_Testall+0xab)[0x14b53d6d144b]
[hydro61:1024441] [12] /u/svchydrojenkins/mpiGraph/mpiGraph[0x40122a]
[hydro61:1024441] [13] /u/svchydrojenkins/mpiGraph/mpiGraph[0x401d78]
[hydro61:1024441] [14] /usr/lib64/libc.so.6(__libc_start_main+0xf3)[0x14b53d062cf3]
[hydro61:1024441] [15] /u/svchydrojenkins/mpiGraph/mpiGraph[0x400d5e]
[hydro61:1024441] *** End of error message ***
srun: error: hydro61: task 7: Aborted
[hydro01:431518] *** An error occurred in MPI_Testall
[hydro01:431518] *** reported by process [4009049243,0]
[hydro01:431518] *** on communicator MPI_COMM_WORLD
[hydro01:431518] *** MPI_ERR_INTERN: internal error
[hydro01:431518] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[hydro01:431518] ***    and potentially your MPI job)
slurmstepd: error: *** STEP 23937.0 ON hydro01 CANCELLED AT 2023-05-05T13:28:34 ***
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
srun: error: hydro07: task 6: Killed
srun: error: hydro03: task 2: Killed
srun: error: hydro02: task 1: Killed
srun: error: hydro06: task 5: Killed
srun: error: hydro01: task 0: Exited with exit code 17
srun: error: hydro04: task 3: Killed
srun: error: hydro05: task 4: Killed
srun: error: hydro65: task 11: Killed
srun: error: hydro62: task 8: Killed
srun: error: hydro64: task 10: Killed
srun: error: hydro67: task 13: Killed
srun: error: hydro68: task 14: Killed
srun: error: hydro63: task 9: Killed
srun: error: hydro66: task 12: Killed
srun: error: hydro69: task 15: Killed
NODECOUNT=$1
+ NODECOUNT=16

# clip out the Send table and plot it
egrep --after-context=$NODECOUNT "^Send\s+hy" mpiGraph.out \
   | sed -e s/.to//g > send.txt 
+ egrep --after-context=16 '^Send\s+hy' mpiGraph.out
+ sed -e s/.to//g
python3 send.py
+ python3 send.py
Traceback (most recent call last):
  File "/u/svchydrojenkins/mpiGraph/send.py", line 9, in <module>
    df = pd.read_csv("send.txt", sep = "\t")
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 680, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 575, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 933, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1235, in _make_engine
    return mapping[engine](f, **self.options)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 75, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx", line 551, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file

# clip out the Recv table and plot it
egrep --after-context=$NODECOUNT "^Recv\s+hy" mpiGraph.out \
   | sed -e s/.from//g > recv.txt 
+ egrep --after-context=16 '^Recv\s+hy' mpiGraph.out
+ sed -e s/.from//g
python3 recv.py
+ python3 recv.py
Traceback (most recent call last):
  File "/u/svchydrojenkins/mpiGraph/recv.py", line 9, in <module>
    df = pd.read_csv("recv.txt", sep = "\t")
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 680, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 575, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 933, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/readers.py", line 1235, in _make_engine
    return mapping[engine](f, **self.options)
  File "/sw/external/python/anaconda3_gpu/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 75, in __init__
    self._reader = parsers.TextReader(src, **kwds)
  File "pandas/_libs/parsers.pyx", line 551, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file

[SSH] completed
[SSH] exit-status: 1

Build step 'Execute shell script on remote host using ssh' marked build as failure
Finished: FAILURE