Bootstrap: docker
From: nvidia/cuda:12.4.1-devel-ubuntu22.04
%labels
    Author      Bradley A. A. Martin
    Target      UCL Young (A100-SXM4-40GB, Sun Grid Engine)
    Description LAMMPS (stable_22Jul2025) + KOKKOS CUDA (sm80/A100) + ML-IAP + MACE + UCX + OpenMPI for Young SGE mpirun
    LAMMPS      stable_22Jul2025
    CUDA_Base   12.4.1
    Torch       2.6.0+cu124
    cuEq        0.7.0
    MACE        0.3.14
    UCX         1.16.0
    OpenMPI     4.1.6
%environment
    # Runtime environment for the container (this is the body of the
    # environment section of the definition file).

    # Install prefixes of the components built in the post section.
    export UCX_PREFIX=/opt/ucx
    export OMPI_PREFIX=/opt/openmpi
    export LAMMPS_PREFIX=/opt/lammps

    # PATH is replaced outright (the inherited PATH is deliberately not appended),
    # with the LAMMPS / Open MPI / UCX / CUDA tool directories ahead of the system ones.
    export PATH=${LAMMPS_PREFIX}/bin:${OMPI_PREFIX}/bin:${UCX_PREFIX}/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

    # Library search path: /.singularity.d/libs first, then our own builds, then CUDA.
    # The inherited value is appended only when it is non-empty: an unconditional
    # "...:${LD_LIBRARY_PATH:-}" leaves a trailing ':' when nothing is inherited, and
    # the dynamic loader treats an empty entry as the current working directory.
    export LD_LIBRARY_PATH=/.singularity.d/libs:${LAMMPS_PREFIX}/lib:${OMPI_PREFIX}/lib:${UCX_PREFIX}/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

    # Threading defaults (SGE jobs can override explicitly)
    export OMP_NUM_THREADS=${OMP_NUM_THREADS:-1}
    export PYTHONUNBUFFERED=1

    # Prefer UCX for MPI transports; avoid smcuda
    export OMPI_MCA_pml=ucx
    export OMPI_MCA_osc=ucx
    export OMPI_MCA_btl=^smcuda

    # Optional: silence UCX unused env warnings
    export UCX_WARN_UNUSED_ENV_VARS=n
%post
    # Abort the build on the first failing command or unset variable.
    set -eu
    # Keep apt/dpkg from prompting during the unattended build.
    export DEBIAN_FRONTEND=noninteractive
    echo "=== [Base] System deps ==="
    apt-get update
    # Build toolchain, Python, numeric libraries, RDMA/UCX development packages
    # and a few interactive conveniences. One package per line, sorted
    # alphabetically, so additions and removals show up as one-line diffs.
    # NOTE(review): libucx0 / libucx-dev / ucx-utils install the distribution's
    # UCX next to the UCX built from source later in this section — confirm
    # that having both in the image is intended.
    apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        bison \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        flex \
        gfortran \
        git \
        less \
        libblas-dev \
        libevent-dev \
        libffi-dev \
        libfftw3-dev \
        libhwloc-dev \
        libibverbs-dev \
        libjpeg-dev \
        liblapack-dev \
        libnuma-dev \
        libpng-dev \
        librdmacm-dev \
        libreadline-dev \
        libssl-dev \
        libtool \
        libucx-dev \
        libucx0 \
        m4 \
        make \
        ninja-build \
        openssh-client \
        pkg-config \
        pybind11-dev \
        python3 \
        python3-dev \
        python3-distutils \
        python3-pip \
        python3-setuptools \
        python3-venv \
        python3-wheel \
        ucx-utils \
        uidmap \
        vim \
        wget \
        zlib1g-dev
    # Drop the package index to keep the image small.
    rm -rf /var/lib/apt/lists/*
    # Make a plain `python` command resolve to python3; later steps call `python`.
    update-alternatives --install /usr/bin/python python /usr/bin/python3 10
    # Every pip call in this step shares the same front end; keep it in one place.
    # Usage: pip_install [pip options] <requirement>...
    pip_install() {
        python -m pip install --no-cache-dir "$@"
    }

    echo "=== [Python] Upgrade pip and basic tools ==="
    pip_install --upgrade pip wheel setuptools

    echo "=== [Torch] Install PyTorch (CUDA 12.4 wheels) ==="
    # The +cu124 builds are served from the PyTorch wheel index, not PyPI.
    pip_install --index-url https://download.pytorch.org/whl/cu124 \
        torch==2.6.0+cu124 torchvision==0.21.0+cu124 torchaudio==2.6.0+cu124

    echo "=== [MACE stack] Install mace-torch, mace-models, deps ==="
    # Only mace-torch is pinned here; the remaining packages resolve to
    # whatever versions pip selects at build time.
    pip_install \
        cython numpy scipy ase h5py matplotlib pyfftw cupy-cuda12x e3nn \
        mace-torch==0.3.14 \
        mace-models

    echo "=== [PIP] Install cuEquivariance 0.7.0 (base + torch + ops-cu12) ==="
    # The three cuEquivariance packages are pinned to the same release.
    pip_install \
        cuequivariance==0.7.0 cuequivariance-torch==0.7.0 cuequivariance-ops-torch-cu12==0.7.0
    echo "=== [UCX] Build UCX with CUDA support ==="
    # Build UCX from the upstream release tarball into ${UCX_PREFIX}; the
    # Open MPI build that follows is configured against this prefix.
    export UCX_VERSION=1.16.0
    export UCX_PREFIX=/opt/ucx
    cd /tmp
    wget -q "https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz"
    tar -xf "ucx-${UCX_VERSION}.tar.gz"
    cd "ucx-${UCX_VERSION}"
    # --with-cuda / --with-verbs : CUDA and InfiniBand verbs transports.
    # --enable-mt                : thread-safe (multi-threaded) UCX.
    # --disable-logging/-debug/-assertions/-params-check : release-mode build.
    #   A bare ./configure leaves debug logging, assertions and API parameter
    #   checks compiled in, which costs performance on the communication path;
    #   these are the flags UCX's own contrib/configure-release passes.
    ./configure \
        --prefix="${UCX_PREFIX}" \
        --with-cuda=/usr/local/cuda \
        --with-verbs \
        --enable-mt \
        --disable-logging \
        --disable-debug \
        --disable-assertions \
        --disable-params-check
    make -j"$(nproc)"
    make install
    # Leave the build tree before deleting it.
    cd /
    rm -rf /tmp/ucx*
    echo "=== [OpenMPI] Build CUDA + UCX aware OpenMPI for SGE-launched mpirun ==="
    # Build Open MPI from the upstream tarball into ${OMPI_PREFIX}, linked
    # against the UCX installed under ${UCX_PREFIX} and the CUDA toolkit.
    export OMPI_VERSION=4.1.6
    export OMPI_PREFIX=/opt/openmpi
    cd /tmp
    wget -q "https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz"
    tar -xf "openmpi-${OMPI_VERSION}.tar.gz"
    cd "openmpi-${OMPI_VERSION}"
    # --with-sge: Open MPI does not build its Grid Engine support unless it is
    #   requested explicitly, and the build environment has no SGE installation
    #   for configure to detect. Without this flag mpirun cannot read the SGE
    #   allocation itself, despite this image targeting SGE.
    # --enable-mpi-cxx: keep the (deprecated) MPI C++ bindings available.
    ./configure \
        --prefix="${OMPI_PREFIX}" \
        --with-cuda=/usr/local/cuda \
        --with-ucx="${UCX_PREFIX}" \
        --with-sge \
        --enable-mpi-cxx \
        --disable-static \
        --enable-shared
    make -j"$(nproc)"
    make install
    # Expose the launchers on the default system path as well.
    ln -sf "${OMPI_PREFIX}/bin/mpirun"     /usr/bin/mpirun
    ln -sf "${OMPI_PREFIX}/bin/mpiexec"    /usr/bin/mpiexec
    ln -sf "${OMPI_PREFIX}/bin/ompi_info"  /usr/bin/ompi_info
    # Leave the build tree before deleting it.
    cd /
    rm -rf /tmp/openmpi*
    echo "=== [mpi4py] Build against our OpenMPI ==="
    # Put the freshly built Open MPI first on the search paths for the rest of
    # the build. LD_LIBRARY_PATH may be unset at this point and the script runs
    # under `set -u`, so append the old value only when it is non-empty (this
    # also avoids a trailing ':' meaning "current directory").
    export PATH=${OMPI_PREFIX}/bin:${PATH}
    export LD_LIBRARY_PATH=${OMPI_PREFIX}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
    # --no-binary=mpi4py forces a source build. Without it pip may install a
    # prebuilt wheel from PyPI, in which case MPICC is ignored and nothing is
    # compiled against the Open MPI in ${OMPI_PREFIX}.
    MPICC="${OMPI_PREFIX}/bin/mpicc" python -m pip install --no-cache-dir --no-binary=mpi4py mpi4py
    echo "=== [LAMMPS] Clone stable_22Jul2025 and build with KOKKOS(CUDA sm80) + ML-IAP ==="
    export LAMMPS_PREFIX=/opt/lammps
    mkdir -p "${LAMMPS_PREFIX}"
    cd /tmp
    rm -rf /tmp/lammps
    # Shallow clone of just the release tag: the build only needs that tree,
    # so there is no reason to download the full project history.
    # (--branch accepts a tag and leaves HEAD detached at it.)
    git clone --depth 1 --branch stable_22Jul2025 https://github.com/lammps/lammps.git
    cd lammps
    mkdir -p build
    cd build

    # Compile with the MPI wrappers from the Open MPI built earlier.
    # Old path values are appended only when non-empty so no empty
    # ("current directory") entry is introduced.
    export MPI_HOME=${OMPI_PREFIX}
    export PATH=${MPI_HOME}/bin:${PATH}
    export LD_LIBRARY_PATH=${MPI_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
    export CC=mpicc
    export CXX=mpicxx

    echo "=== [CUDA driver stubs] Make libcuda visible at build/link time ==="
    # No GPU driver is present while the image is built; link against the
    # toolkit's stub libcuda instead. The stub directory is added to the
    # link-time search path (LIBRARY_PATH, -L, -rpath-link) only — it is not
    # part of the install RPATH set below.
    export CUDA_HOME=/usr/local/cuda
    export CUDA_STUBS=${CUDA_HOME}/lib64/stubs
    ln -sf "${CUDA_STUBS}/libcuda.so" "${CUDA_STUBS}/libcuda.so.1"
    export LIBRARY_PATH=${CUDA_STUBS}${LIBRARY_PATH:+:${LIBRARY_PATH}}

    # Use LAMMPS' preset as a base
    cp ../cmake/presets/kokkos-cuda.cmake ./
    # Young GPUs are A100 (sm_80): build SASS-only to avoid PTX-version runtime issues.
    # ML-IAP is built with its Python coupling (MLIAP_ENABLE_PYTHON + PKG_PYTHON).
    # The install RPATH lists the LAMMPS, Open MPI, UCX and CUDA library directories.
    cmake -C kokkos-cuda.cmake \
        -S ../cmake -B . \
        -D CMAKE_BUILD_TYPE=Release \
        -D CMAKE_INSTALL_PREFIX="${LAMMPS_PREFIX}" \
        -D BUILD_MPI=ON \
        -D BUILD_SHARED_LIBS=ON \
        -D PKG_ML-IAP=ON \
        -D PKG_ML-SNAP=ON \
        -D MLIAP_ENABLE_PYTHON=ON \
        -D PKG_PYTHON=ON \
        -D Kokkos_ARCH_AMPERE80=ON \
        -D CMAKE_CUDA_ARCHITECTURES=80-real \
        -D CMAKE_SHARED_LINKER_FLAGS="-Wl,-rpath-link,${CUDA_STUBS} -L${CUDA_STUBS}" \
        -D CMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,${CUDA_STUBS} -L${CUDA_STUBS}" \
        -D CMAKE_INSTALL_RPATH="${LAMMPS_PREFIX}/lib;${OMPI_PREFIX}/lib;${UCX_PREFIX}/lib;/usr/local/cuda/lib64" \
        -D CMAKE_BUILD_WITH_INSTALL_RPATH=ON
    cmake --build . -j"$(nproc)"
    cmake --install .
    # Install the LAMMPS Python module into the image's Python.
    cmake --build . --target install-python
    ln -sf "${LAMMPS_PREFIX}/bin/lmp" /usr/bin/lmp
    echo "=== [Wrapper] Young SGE-friendly launcher (uses scheduler machinefile when present) ==="
    # Write the launcher script. The here-document delimiter is quoted, so the
    # body is written verbatim (nothing is expanded at build time). The body and
    # its terminator must stay at column 0.
    cat >/usr/local/bin/lmp_sge <<'EOF'
#!/usr/bin/env bash
# lmp_sge - start LAMMPS through the container's Open MPI inside an SGE job.
#
# Usage:
#   lmp_sge [-np N] [lammps args...]     default KOKKOS/GPU flags are prepended
#   lmp_sge [-np N] -- [lammps args...]  arguments are passed to lmp verbatim
#
# Environment read:
#   NSLOTS                default rank count when -np is not given (else 1)
#   TMPDIR, PE_HOSTFILE   used to locate or build an Open MPI hostfile
#   JOB_ID                used in the generated hostfile name
#   OMP_NUM_THREADS       kept if set (see note below)
#   CUDA_VISIBLE_DEVICES, GPU_LIST, GPU_OFFSET
#                         per-rank GPU selection, in that order of precedence
set -euo pipefail

readonly MPIRUN=/opt/openmpi/bin/mpirun
readonly LMP=/opt/lammps/bin/lmp

# Print a message on stderr and exit with status 2 (usage error).
die() {
  printf 'lmp_sge: %s\n' "$*" >&2
  exit 2
}

# Rank count: the scheduler's slot count unless overridden with "-np N".
NP="${NSLOTS:-1}"
if [[ "${1:-}" == "-np" ]]; then
  # Without this check a bare "-np" dies with an opaque "unbound variable".
  (( $# >= 2 )) || die "-np requires a rank count"
  NP="$2"
  shift 2
fi
[[ "${NP}" =~ ^[1-9][0-9]*$ ]] || die "rank count must be a positive integer, got '${NP}'"

# Hostfile: use ${TMPDIR}/machines when it exists; otherwise convert
# PE_HOSTFILE lines ("host slots ...") into Open MPI "host slots=N" lines.
HOSTFILE=""
if [[ -n "${TMPDIR:-}" && -f "${TMPDIR}/machines" ]]; then
  HOSTFILE="${TMPDIR}/machines"
elif [[ -n "${PE_HOSTFILE:-}" && -f "${PE_HOSTFILE}" ]]; then
  HOSTFILE="${TMPDIR:-/tmp}/openmpi.hostfile.${JOB_ID:-$$}"
  awk 'NF >= 2 { printf "%s slots=%s\n", $1, $2 }' "${PE_HOSTFILE}" > "${HOSTFILE}"
fi

# If the user already set OMP_NUM_THREADS keep it. Otherwise, for the common
# single-rank case, use the allocated slot count as a sensible default.
# NOTE(review): the image environment exports OMP_NUM_THREADS with a default
# of 1, so inside the container this branch probably never fires - confirm
# which default is intended.
if [[ -z "${OMP_NUM_THREADS:-}" && "${NP}" == "1" && -n "${NSLOTS:-}" ]]; then
  export OMP_NUM_THREADS="${NSLOTS}"
fi

# A leading "--" means "no default flags": pass everything after it as-is.
if [[ "${1:-}" == "--" ]]; then
  shift
  LMP_ARGS=("$@")
else
  LMP_ARGS=(-k on g 1 -sf kk -pk kokkos gpu/aware on neigh half newton on "$@")
fi

MPI_ARGS=(--allow-run-as-root -np "${NP}" --bind-to none)
if [[ -n "${HOSTFILE}" ]]; then
  MPI_ARGS+=(--hostfile "${HOSTFILE}")
fi

# Each rank runs the small script below, which picks a GPU and then execs
# LAMMPS. The lmp path is passed as its first argument so it is defined in
# one place only. The script is single-quoted: it must not contain single
# quotes.
# NOTE(review): "bash -l" starts a login shell per rank, which reads profile
# files - confirm that is wanted.
exec "${MPIRUN}" "${MPI_ARGS[@]}" bash -lc '
set -euo pipefail
lmp_bin=$1
shift
LR=${OMPI_COMM_WORLD_LOCAL_RANK:-0}
# GPU selection:
#  - Preserve CUDA_VISIBLE_DEVICES if the job script already set it.
#  - GPU_LIST="1,2,3,4" maps rank->GPU (cycles if fewer GPUs than ranks);
#    whitespace around the commas is ignored.
#  - GPU_OFFSET=1 maps rank0->GPU1, rank1->GPU2, ...
if [[ -n "${CUDA_VISIBLE_DEVICES:-}" ]]; then
  :
elif [[ -n "${GPU_LIST:-}" ]]; then
  IFS=, read -ra GPUS <<< "${GPU_LIST//[[:space:]]/}"
  if (( ${#GPUS[@]} == 0 )); then
    echo "lmp_sge: GPU_LIST contains no GPU ids" >&2
    exit 2
  fi
  idx=$(( LR % ${#GPUS[@]} ))
  export CUDA_VISIBLE_DEVICES="${GPUS[$idx]}"
elif [[ -n "${GPU_OFFSET:-}" ]]; then
  # Validate before doing arithmetic: shell arithmetic would otherwise
  # evaluate arbitrary expressions, and read a leading 0 as octal.
  if [[ ! "${GPU_OFFSET}" =~ ^[0-9]+$ ]]; then
    echo "lmp_sge: GPU_OFFSET must be a non-negative integer" >&2
    exit 2
  fi
  export CUDA_VISIBLE_DEVICES="$(( LR + 10#${GPU_OFFSET} ))"
else
  export CUDA_VISIBLE_DEVICES="${LR}"
fi
exec "${lmp_bin}" "$@"
' bash "${LMP}" "${LMP_ARGS[@]}"
EOF
    chmod +x /usr/local/bin/lmp_sge
    ln -sf /usr/local/bin/lmp_sge /usr/bin/lmp_sge
    echo "=== [Cleanup] Remove tmp build dirs ==="
    # Step out of the build tree before deleting it, then drop the LAMMPS
    # clone so its sources and build products are not left under /tmp.
    cd /
    rm -rf /tmp/lammps
    echo "=== [Done] Build complete ==="
%runscript
    # Replace this shell with the command line that was passed to the
    # runscript, forwarding every argument unchanged.
    exec "$@"