- Prepare symlinks:
TODO: also symlink/cache the nltk, datasets and transformers data
TODO: set up sentence-transformers as well
ln -s $six_ALL_CCFRWORK/PII/cache_dir/.neuralcoref $HOME/.neuralcoref
ln -s $six_ALL_CCFRWORK/PII/cache_dir/.cache/transformers $HOME/.cache/transformers
ln -s $six_ALL_CCFRWORK/PII/cache_dir/.cache/huggingface $HOME/.cache/huggingface
ln -s $six_ALL_CCFRWORK/PII/cache_dir/nltk_data $HOME/nltk_data
- Check current project:
idrproj
- Set the HF project:
eval $(idrenv -d six)
- Go into $six_ALL_CCFRWORK/PII
cd $six_ALL_CCFRWORK/PII
- Get the repo
git clone https://github.com/ontocord/muliwai.git
- Set up the Python environment:
module load pytorch-gpu/py3/1.7.0
cd muliwai
pip install -r requirements_pierre_spacy.txt
- Check if everything works on the dev node (note: the command below requests --gres=gpu:0, i.e. no GPU — bump to gpu:1 if a GPU run is intended)
srun --pty --partition=prepost --account=six@gpu --nodes=1 --ntasks=1 --cpus-per-task=10 --gres=gpu:0 --hint=nomultithread --time=1:00:00 bash
python process.py -src_lang zh -cutoff 30 -preload_cache
- Check if everything works on the prod node without internet
srun --pty --partition=prepost --account=six@gpu --nodes=1 --ntasks=1 --cpus-per-task=10 --gres=gpu:0 --hint=nomultithread --time=1:00:00 bash
export HF_DATASETS_OFFLINE=1 # ugly but Datasets and Transformers are bugged
export TRANSFORMERS_OFFLINE=1
time python process.py -src_lang zh -cutoff 30
with commit 85c8737 we got
real 11m47.634s
user 6m42.942s
sys 1m43.794s
- Target more gpus
export GPU_NUMBERS=3
srun --pty -A six@gpu --nodes=1 --ntasks=1 --cpus-per-task=10 --gres=gpu:$GPU_NUMBERS --hint=nomultithread --time=60 bash
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
time python process.py -src_lang zh -num_workers=$GPU_NUMBERS -cutoff 30
with commit 85c8737 we got