forked from lm-sys/FastChat
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsync_local_checkpoint.sh
28 lines (24 loc) · 1.13 KB
/
sync_local_checkpoint.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/bin/bash
#
# Periodically copy the newest local checkpoint to mounted storage and prune
# old local checkpoints.
#
# Usage: sync_local_checkpoint.sh <local_path> <remote_path>
#   local_path   directory containing checkpoint-<step> subdirectories
#   remote_path  mounted storage directory that receives the copies
#
# Every 600 seconds the script compares the highest-numbered checkpoint-<step>
# entry on each side. When they differ, the local one is copied with
# `gsutil -m rsync -r`; only after a successful copy are all but the newest
# MAX_NUM_CKPT local checkpoints deleted.
#
# `set -e` is deliberately not used: this is a long-running loop, and a
# transient failure should be logged and retried rather than end the script.

if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  printf 'Usage: %s <local_path> <remote_path>\n' "${0##*/}" >&2
  exit 2
fi

local_path=$1
remote_path=$2
MAX_NUM_CKPT=3

#######################################
# List the checkpoint-<step> entries of a directory, lowest step first.
# Only names that are exactly "checkpoint-" followed by digits are reported,
# so partially named or temporary entries are never picked up.
# Arguments: $1 - directory to scan
# Outputs:   one entry name per line, sorted numerically by <step>
#######################################
list_checkpoints() {
  local dir=$1
  local entry name
  for entry in "${dir}"/checkpoint-*; do
    name=${entry##*/}
    # Also rejects the literal pattern left behind when nothing matches.
    [[ "${name}" =~ ^checkpoint-[0-9]+$ ]] || continue
    printf '%s\n' "${name}"
  done | sort -t'-' -k2,2n
}

#######################################
# Delete all but the newest MAX_NUM_CKPT checkpoints under local_path.
# Globals:   local_path (read), MAX_NUM_CKPT (read)
# Outputs:   logs the number of local checkpoints to stdout
#######################################
prune_local_checkpoints() {
  local -a ckpts=()
  local ckpt num_local_ckpt num_to_delete

  while IFS= read -r ckpt; do
    ckpts+=("${ckpt}")
  done < <(list_checkpoints "${local_path}")

  num_local_ckpt=${#ckpts[@]}
  echo "num_local_ckpt: ${num_local_ckpt}"
  if (( num_local_ckpt > MAX_NUM_CKPT )); then
    num_to_delete=$(( num_local_ckpt - MAX_NUM_CKPT ))
    for ckpt in "${ckpts[@]:0:num_to_delete}"; do
      # ${local_path:?} aborts instead of deleting from "/" if the path is empty.
      rm -rf -- "${local_path:?}/${ckpt}"
    done
  fi
}

#######################################
# Sync loop: runs until the script is killed.
# Globals:   local_path (read), remote_path (read)
#######################################
main() {
  local local_last_ckpt remote_last_ckpt
  # Name of the checkpoint whose last copy attempt failed. Its directory
  # already exists remotely (mkdir runs before the copy), so a plain name
  # comparison would wrongly consider it up to date.
  local failed_ckpt=""

  while true; do
    local_last_ckpt=$(list_checkpoints "${local_path}" | tail -n 1)
    remote_last_ckpt=$(list_checkpoints "${remote_path}" | tail -n 1)
    echo "local_last_ckpt: ${local_last_ckpt}"
    echo "remote_last_ckpt: ${remote_last_ckpt}"

    # Skip when there is no local checkpoint yet: an empty name would make
    # rsync copy the whole of local_path into remote_path.
    if [[ -n "${local_last_ckpt}" ]] \
      && [[ "${local_last_ckpt}" != "${remote_last_ckpt}" \
        || "${local_last_ckpt}" == "${failed_ckpt}" ]]; then
      if mkdir -p -- "${remote_path}/${local_last_ckpt}" \
        && gsutil -m rsync -r "${local_path}/${local_last_ckpt}/" \
          "${remote_path}/${local_last_ckpt}"; then
        failed_ckpt=""
        # Keep only the last MAX_NUM_CKPT checkpoints. Done only after a
        # successful copy so a failed upload never costs local data.
        prune_local_checkpoints
      else
        failed_ckpt=${local_last_ckpt}
        printf 'sync of %s failed; will retry on the next cycle\n' \
          "${local_last_ckpt}" >&2
      fi
    fi
    sleep 600
  done
}

main "$@"