-
Notifications
You must be signed in to change notification settings - Fork 970
/
Copy pathDockerfile.tmpl
216 lines (174 loc) · 10 KB
/
Dockerfile.tmpl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# Build arguments consumed by the FROM lines below. They are declared before the
# first FROM, so they are only in scope for FROM instructions.
ARG BASE_IMAGE
ARG BASE_IMAGE_TAG
ARG LIGHTGBM_VERSION

{{ if eq .Accelerator "gpu" }}
# GPU builds only: auxiliary stage whose /tmp/whl/*.whl files are copied into
# the final image later with `COPY --from=lightgbm_whl`.
FROM gcr.io/kaggle-images/python-lightgbm-whl:${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
{{ end }}

# Final image, layered on top of the parameterized base image.
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}
# Kaggle's package list; merged into /requirements.txt below.
# (Plain local file, so COPY is used instead of ADD.)
COPY kaggle_requirements.txt /kaggle_requirements.txt
# Freeze existing requirements from base image for critical packages:
RUN pip freeze | grep -E 'tensorflow|keras|torch|jax|lightgbm' > /colab_requirements.txt
# Merge requirements files (frozen base-image versions first, then Kaggle's list):
RUN cat /colab_requirements.txt >> /requirements.txt
RUN cat /kaggle_requirements.txt >> /requirements.txt
# TODO: GPU requirements.txt
# TODO: merge them better (override matching ones).
# Install uv & Kaggle packages
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
# PATH is extended inline here because the ENV below has not taken effect yet.
RUN export PATH="${HOME}/.local/bin:${PATH}" && uv pip install --system -r /requirements.txt
# Docker's ENV performs no tilde expansion, so "~/.local/bin" would land on PATH
# as a literal entry that exec-style lookups cannot resolve. Use the absolute
# path instead; this file treats /root as the home directory elsewhere
# (PYTHONUSERBASE=/root/.local, /root/.EasyOCR, /root/.jupyter).
ENV PATH="/root/.local/bin:${PATH}"
# Packages that need special handling and therefore are not installed through
# /requirements.txt.

# b/183041606#comment5: the Kaggle data proxy doesn't support the BigQuery
# Storage APIs. When the library is missing, the client falls back to a regular
# BigQuery query to fetch data, so it is removed on purpose.
RUN uv pip uninstall --system google-cloud-bigquery-storage

# learntools: uv cannot install it from requirements.txt without
# --no-build-isolation. To avoid applying that flag to the larger build, it is
# installed afterwards on its own.
RUN uv pip install --system --no-build-isolation \
    "git+https://github.com/Kaggle/learntools"

# b/385161357: latest Colab uses tf 2.17.1, but tf decision forests only has a
# release for 2.17.0. tfdf and its dependencies are listed explicitly and
# installed with --no-deps, in the hope that the 2.17.0-compatible tfdf works
# with tf 2.17.1.
RUN uv pip install --system --no-deps \
    tensorflow-decision-forests==1.10.0 \
    wurlitzer==3.1.1 \
    ydf==0.9.0

# b/385145217: latest Colab lacks MKL numpy, so reinstall numpy from the Intel index.
RUN uv pip install --system --force-reinstall \
    -i https://pypi.anaconda.org/intel/simple \
    numpy

# b/328788268: we install an incompatible pair of libs (shapely<2,
# libpysal==4.9.2), so this one can't go in requirements.txt.
# Newer daal4py requires tbb>=2022, but libpysal downgrades it for some reason,
# hence the explicit tbb constraint alongside it.
RUN uv pip install --system \
    "tbb>=2022" \
    "libpysal==4.9.2"
# Adding non-package dependencies.
# These are plain local files (no archive extraction or URL fetch needed), so
# COPY is used rather than ADD.
COPY clean-layer.sh /tmp/clean-layer.sh
COPY patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
COPY patches/template_conf.json /opt/kaggle/conf.json
# Python package directory that the patch files further down are copied into.
# NOTE(review): a previous comment here read "/opt/conda/lib/python3.10/site-packages",
# presumably the location used by an older conda-based image - confirm it is obsolete.
ARG PACKAGE_PATH=/usr/local/lib/python3.10/dist-packages
# Install GPU-specific non-pip packages.
# Everything between `if` and `else` is rendered only when the template's
# .Accelerator value is "gpu"; the `else` branch is rendered otherwise.
{{ if eq .Accelerator "gpu" }}
# CUDA version components are passed in as build args and re-exported as
# environment variables so they persist in the image.
ARG CUDA_MAJOR_VERSION \
CUDA_MINOR_VERSION
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION} \
CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
# Make sure we are on the right version of CUDA
RUN update-alternatives --set cuda /usr/local/cuda-$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
RUN uv pip install --system "pycuda"
# b/381256047 Remove once installed in Colabs base image.
# Install LightGBM: fetch the prebuilt wheel(s) from the lightgbm_whl stage.
COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/
# Install OpenCL (required by LightGBM GPU version)
# NOTE(review): there is no `apt-get update` in this layer, so this relies on
# package lists already being present in the base image - confirm.
RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
# Register the NVIDIA OpenCL ICD so the OpenCL loader can find it.
mkdir -p /etc/OpenCL/vendors && \
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
# Install the copied wheel(s), then delete them in the same layer.
uv pip install --system /tmp/lightgbm/*.whl && \
rm -rf /tmp/lightgbm && \
/tmp/clean-layer.sh
{{ else }}
# Non-GPU image: set CUDA_VERSION to an empty string.
ENV CUDA_VERSION=""
{{ end }}
# Update GPG key per documentation at https://cloud.google.com/compute/docs/troubleshooting/known-issues
# The key is added twice: once to the default apt keyring and once to the
# dedicated /usr/share/keyrings/cloud.google.gpg keyring.
# `sudo` is not used: the other privileged steps in this file (apt-get install,
# writes under /etc) already run without it, so the build user is root.
# `curl -fsSL` fails on HTTP errors instead of piping an error page into
# apt-key, follows redirects, and stays quiet apart from real errors.
RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
RUN curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
# Pin the apt mirror to avoid intermittent failures caused by flaky httpredir
# connections, as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list \
    && apt-get update --allow-releaseinfo-change \
    # Build toolchain needed by lightGBM (GPU build):
    # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
    && apt-get install -y \
        build-essential \
        unzip \
        cmake \
        libboost-dev \
        libboost-system-dev \
        libboost-filesystem-dev \
        p7zip-full \
    # b/182601974: ssh client was removed from the base image but is required
    # for packages such as stable-baselines.
    && apt-get install -y openssh-client \
    # graphviz: system binaries first, then the Python bindings.
    && apt-get install -y graphviz \
    && pip install graphviz \
    && /tmp/clean-layer.sh
# Copy the patched keras_internal module and its test into the installed
# tensorflow_decision_forests package. These are plain local files, so COPY is
# used instead of ADD.
COPY patches/keras_internal.py \
     patches/keras_internal_test.py \
     $PACKAGE_PATH/tensorflow_decision_forests/keras/
# System libraries: FreeType development files, then GLib / Xext / SM / Xrender /
# fontconfig shared libraries (installed with --fix-missing).
RUN apt-get install -y libfreetype6-dev \
    && apt-get install -y --fix-missing \
        libglib2.0-0 \
        libxext6 \
        libsm6 \
        libxrender1 \
        libfontconfig1
# NLTK Project datasets
# Downloads a fixed set of NLTK corpora/models into /usr/share/nltk_data at
# build time.
RUN mkdir -p /usr/share/nltk_data && \
# The NLTK downloader no longer continues smoothly after an error, so we
# explicitly list the corpora that work instead of requesting everything.
python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \
basque_grammars biocreative_ppi bllip_wsj_no_aux \
book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \
comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \
europarl_raw floresta gazetteers genesis gutenberg \
ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \
masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \
mte_teip5 names nps_chat omw opinion_lexicon paradigms \
pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \
pros_cons ptb punkt qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \
sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \
state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \
twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \
vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe
# Extra system packages, cleaned up in the same layer.
RUN apt-get install -y git-lfs \
    # vtk dependencies
    && apt-get install -y libgl1-mesa-glx \
    # xvfbwrapper dependencies
    && apt-get install -y xvfb \
    && /tmp/clean-layer.sh
# Pre-download the base EasyOCR models into /root/.EasyOCR/model.
# https://github.com/JaidedAI/EasyOCR#usage
# Each archive is fetched, unpacked in place and then deleted within this layer.
RUN mkdir -p /root/.EasyOCR/model \
    # latin_g2
    && wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" -O /root/.EasyOCR/model/latin.zip \
    && unzip /root/.EasyOCR/model/latin.zip -d /root/.EasyOCR/model/ \
    && rm /root/.EasyOCR/model/latin.zip \
    # english_g2
    && wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" -O /root/.EasyOCR/model/english.zip \
    && unzip /root/.EasyOCR/model/english.zip -d /root/.EasyOCR/model/ \
    && rm /root/.EasyOCR/model/english.zip \
    # craft_mlt_25k
    && wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" -O /root/.EasyOCR/model/craft_mlt_25k.zip \
    && unzip /root/.EasyOCR/model/craft_mlt_25k.zip -d /root/.EasyOCR/model/ \
    && rm /root/.EasyOCR/model/craft_mlt_25k.zip \
    && /tmp/clean-layer.sh
# Tesseract OCR engine, plus runtime environment for a few libraries.
RUN apt-get install -y tesseract-ocr
ENV TESSERACT_PATH=/usr/bin/tesseract \
    # Facets: the leading ':' is deliberate - it adds an empty entry so that
    # $PWD is included on the Python path as well.
    PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \
    # Theano with MKL
    MKL_THREADING_LAYER=GNU
# Temporary fixes and patches
# Pre-create the nbconvert config and the `migrated` marker under both
# /root/.jupyter and /.jupyter so jupyter nbconvert stops trying to rewrite its
# folder hierarchy.
RUN mkdir -p /root/.jupyter \
    && touch /root/.jupyter/jupyter_nbconvert_config.py \
    && touch /root/.jupyter/migrated \
    && mkdir -p /.jupyter \
    && touch /.jupyter/jupyter_nbconvert_config.py \
    && touch /.jupyter/migrated \
    # Make matplotlib output display correctly in Jupyter notebooks.
    && mkdir -p /etc/ipython/ \
    && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py \
    && /tmp/clean-layer.sh
# Fix to import bq_helper library without downgrading setuptools.
# The checkout is reshaped into a `bq_helper/` package directory and setup.py is
# patched to declare it, then the result is installed in editable mode.
RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \
# Turn the top-level module into a package: bq_helper.py becomes
# bq_helper/__init__.py and test_helper.py moves alongside it.
mkdir -p ~/src/BigQuery_Helper/bq_helper && \
mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \
mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \
# Replaces EVERY ')' in setup.py with 'packages=["bq_helper"])'.
# NOTE(review): presumably setup.py contains a single closing paren (the end of
# the setup(...) call) - verify against upstream before touching this.
sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \
# Editable install (-e): the package keeps pointing at ~/src/BigQuery_Helper,
# so that directory must stay in the image.
uv pip install --system -e ~/src/BigQuery_Helper && \
/tmp/clean-layer.sh
# Install ImageMagick for wand
# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu
# `-y` is required: a Docker build has no interactive stdin, so without it
# apt-get aborts as soon as it needs to ask for confirmation.
RUN apt-get install -y libmagickwand-dev
# Override default ImageMagick policies (plain local file, so COPY rather than ADD).
COPY patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml
# Add Kaggle module resolver
# Drop the resolver module into the installed tensorflow_hub package (plain
# local file, so COPY rather than ADD) ...
COPY patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py
# ... then patch tensorflow_hub/config.py in place:
#   1. append an import of kaggle_module_resolver after the line importing
#      uncompressed_module_resolver;
#   2. append a registration of KaggleFileResolver after each line matching
#      `_install_default_resolvers()` (the `\ \ ` yields a two-space indent).
RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' $PACKAGE_PATH/tensorflow_hub/config.py && \
sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' $PACKAGE_PATH/tensorflow_hub/config.py
# Add BigQuery client proxy settings
ENV PYTHONUSERBASE="/root/.local"
# Kaggle helper modules are copied straight into the Python package directory so
# they are importable as top-level modules. These are plain local files, so COPY
# is used instead of ADD.
COPY patches/kaggle_gcp.py \
     patches/kaggle_secrets.py \
     patches/kaggle_session.py \
     patches/kaggle_web_client.py \
     patches/kaggle_datasets.py \
     patches/log.py \
     $PACKAGE_PATH/
# TODO: figure out why sitecustomize.py has to live under /usr/lib/python3.10
# rather than in the package directory used for the other patches.
# This location was found by setting PYTHONVERBOSE=1, starting python, and
# checking where the interpreter looked for sitecustomize.
# (Plain local file, so COPY rather than ADD.)
COPY patches/sitecustomize.py /usr/lib/python3.10/sitecustomize.py
# Build provenance, supplied as build args; both default to "unknown".
ARG GIT_COMMIT=unknown
ARG BUILD_DATE=unknown
# Expose the values as image labels and as runtime environment variables.
LABEL git-commit=${GIT_COMMIT} \
      build-date=${BUILD_DATE}
ENV GIT_COMMIT=${GIT_COMMIT} \
    BUILD_DATE=${BUILD_DATE}
# Also write them to files, so the current release can be correlated with the
# git hash from inside the kernel editor by running `!cat /etc/git_commit`.
RUN echo "$GIT_COMMIT" > /etc/git_commit \
    && echo "$BUILD_DATE" > /etc/build_date
{{ if eq .Accelerator "gpu" }}
# Add the CUDA home (rendered for GPU images only).
# NOTE(review): /usr/local/cuda is presumably the path managed by the
# `update-alternatives --set cuda` step earlier in this file - confirm.
ENV CUDA_HOME=/usr/local/cuda
{{ end }}
# Exec-form entrypoint: the container's command and arguments are passed to
# /usr/bin/env, which runs them with the image's environment.
ENTRYPOINT ["/usr/bin/env"]