Aryan V S

CLI stash

# Show the cluster's nodes with four columns: node list, CPU count, memory and
# generic resources (GRES).
sinfo --format="%N %c %m %G"

# Example output:
# NODELIST CPUS MEMORY GRES
# cluster2qh-nodeset1-[0-3] 224 2933888 gpu:8
# Open an interactive bash shell through srun, requesting 8 GPUs, 32 CPUs per
# task and 2000G of memory.
srun \
  --gres=gpu:8 \
  --cpus-per-task=32 \
  --mem=2000G \
  --pty \
  bash -i
# Upload every 500th checkpoint (entries named ckpt_step<N> in the current
# directory) to S3, keeping the entry's name as the last path component.
#
# The destination is quoted so the <BUCKET> placeholder is treated as plain
# text instead of being parsed as shell redirections; replace it with the real
# bucket name before running.
s3_prefix='s3://<BUCKET>/aryan/lora/teacher-forcing/1p3b---lr-1e-4---wd-1e-5---ema-0p99---rank-256'
for d in ckpt_step*; do
  # When nothing matches, the glob stays as the literal string "ckpt_step*";
  # skip it instead of trying to upload a path that does not exist.
  [[ -e "$d" ]] || continue
  # The step number is the first run of digits in the name. Names without any
  # digits carry no step number and are skipped.
  [[ "$d" =~ [0-9]+ ]] || continue
  # 10# forces base 10 so zero-padded steps such as 000500 are not read as octal.
  step=$((10#${BASH_REMATCH[0]}))
  if (( step % 500 == 0 )); then
    aws s3 cp --recursive "$d" "${s3_prefix}/${d}"
  fi
done
# Install fio.
sudo apt install -y fio

# Run a mixed random read/write job named "rwtest" against the target
# directory: 4 jobs, 1G size, 4k block size, 60 runtime, grouped reporting.
# Replace /path/to/volume with the directory to test.
fio \
  --name=rwtest \
  --rw=randrw \
  --size=1G \
  --bs=4k \
  --numjobs=4 \
  --runtime=60 \
  --directory=/path/to/volume \
  --group_reporting
# Move Docker's data root to /scratch/local/.docker.
# Step 1: stop the docker, docker.socket and containerd units before editing the config.
sudo systemctl stop docker docker.socket containerd
# Step 2 (manual): edit /etc/docker/daemon.json so that it contains
#   {"data-root": "/scratch/local/.docker"}
# i.e. point the data root at a directory on the ext4 filesystem.
# Step 3: start the same units again.
sudo systemctl start docker docker.socket containerd
# Step 4: print the root directory Docker reports; it should now show the new data-root.
docker info -f '{{ .DockerRootDir}}'
# Build an image from the current directory and tag it cu128-pytorch28:latest.
sudo docker build \
  --progress=tty \
  --tag cu128-pytorch28:latest \
  .
# Import the cu128-pytorch28:latest image with enroot, keeping enroot's data,
# cache, runtime and temporary files under /scratch/shared.
#
# TMPDIR is redirected only for the duration of the import and then put back
# exactly as it was. The ENROOT_* variables (and CUR_TMPDIR) stay exported
# afterwards.

# Remember the current TMPDIR. ${TMPDIR+x} records whether it was set at all,
# so an unset TMPDIR is restored as unset rather than as an empty string.
export CUR_TMPDIR=${TMPDIR-}
tmpdir_was_set=${TMPDIR+x}

export TMPDIR=/scratch/shared/.tmp-enroot
export ENROOT_DATA_PATH=/scratch/shared/.enroot
export ENROOT_CACHE_PATH=/scratch/shared/.enroot/cache
export ENROOT_RUNTIME_PATH=/scratch/shared/.enroot/runtime
mkdir -p -- "$TMPDIR" "$ENROOT_DATA_PATH" "$ENROOT_CACHE_PATH" "$ENROOT_RUNTIME_PATH"

# Quoted so the <USERNAME>/<ORGANIZATION> placeholders are plain text instead
# of being parsed as shell redirections; replace them with real values.
enroot import 'docker://<USERNAME>@<ORGANIZATION>/cu128-pytorch28:latest'

# Restore TMPDIR to its previous state.
if [[ -n "$tmpdir_was_set" ]]; then
  export TMPDIR=$CUR_TMPDIR
else
  unset TMPDIR
fi
unset tmpdir_was_set
# Open up /scratch/local on every host listed in hosts.txt. Each remote command
# is run through pssh inside a login bash shell, one after the other:
#   1. install the acl package,
#   2. set default (d:) ACL entries to rwx for user, group and other,
#   3. set the directory's own ACL entries to rwx for user, group and other,
#   4. chmod the directory to 1777.
pssh_opts=(
  -O StrictHostKeyChecking=no
  -O UserKnownHostsFile=/dev/null
  -t 0
  -h hosts.txt
  -i
)
remote_cmds=(
  'sudo apt update && sudo apt install -y acl'
  'sudo setfacl -m d:u::rwx,d:g::rwx,d:o::rwx /scratch/local'
  'sudo setfacl -m u::rwx,g::rwx,o::rwx /scratch/local'
  'sudo chmod 1777 /scratch/local'
)
for remote_cmd in "${remote_cmds[@]}"; do
  pssh "${pssh_opts[@]}" "bash -lc \"${remote_cmd}\""
done