From ac36a48823917edba29e7f59f5690a4372b0da72 Mon Sep 17 00:00:00 2001 From: Jan Kieseler Date: Wed, 29 Jan 2020 15:51:26 +0100 Subject: [PATCH] 2.1 --- DataCollection.py | 5 +++++ TrainData.py | 1 + compiled/interface/trainData.h | 2 +- docker/Dockerfile | 28 +++++++++++++++++++++++++++- docker/Dockerfile_base | 1 + training/DeepJet_callbacks.py | 10 +++++----- training/training_base.py | 21 ++++++++++++++------- 7 files changed, 54 insertions(+), 14 deletions(-) diff --git a/DataCollection.py b/DataCollection.py index 859ef3c..36da23c 100644 --- a/DataCollection.py +++ b/DataCollection.py @@ -92,9 +92,14 @@ def __radd__(self, other): else: raise ValueError("I don't know how to add DataCollection and %s" % type(other)) + def __len__(self): + return len(self.samples) + def _readShapesIfNeeded(self): if len(self.samples)<1: return + if self.dataclass_instance is None: + self.dataclass_instance = self.dataclass() if self.dataclass_instance.nElements() < 1: self.dataclass_instance.readShapesFromFile(self.getSamplePath(self.samples[0])) diff --git a/TrainData.py b/TrainData.py index f27fdfd..bfc14c0 100644 --- a/TrainData.py +++ b/TrainData.py @@ -14,6 +14,7 @@ from DeepJetCore.compiled.c_trainData import trainData from DeepJetCore.compiled.c_simpleArray import simpleArray +import time def fileTimeOut(fileName, timeOut): ''' diff --git a/compiled/interface/trainData.h b/compiled/interface/trainData.h index a4866e9..289a1fd 100644 --- a/compiled/interface/trainData.h +++ b/compiled/interface/trainData.h @@ -317,7 +317,7 @@ template void trainData::readFromFile(std::string filename){ clear(); FILE *ifile = fopen(filename.data(), "rb"); - checkFile(ifile); + checkFile(ifile, filename); readNested(feature_shapes_, ifile); readNested(truth_shapes_, ifile); readNested(weight_shapes_, ifile); diff --git a/docker/Dockerfile b/docker/Dockerfile index 959154b..41b7ad4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -9,7 +9,6 @@ RUN cd /usr/share && \ cd DJC && \ git clone https://github.com/DL4Jets/DeepJetCore && \ cd DeepJetCore && \ - git checkout 2.1rc01 && \ source docker_env.sh && \ cd compiled && \ make -j4 @@ -20,3 +19,30 @@ ENV PATH="/usr/share/DJC/DeepJetCore/bin:${PATH}" #/usr/local/lib is for root ENV PYTHONPATH="/usr/share/DJC/DeepJetCore/../:${PYTHONPATH}" ENV LD_LIBRARY_PATH="/usr/share/DJC/DeepJetCore/compiled:${LD_LIBRARY_PATH}" + + +# helpers for ragged and cuda compilation + + + +# The fix for TensorFlow + +RUN cd /usr/local/lib/python2.7/dist-packages/tensorflow_core/include/third_party && \ + mkdir gpus && \ + cd gpus && \ + ln -s /usr/local/cuda cuda + + + +#eclipse rse stuff - make extern + +RUN sed -i "s,# deb http://archive.canonical.com/ubuntu,deb http://archive.canonical.com/ubuntu,g" /etc/apt/sources.list +RUN apt update +# do not upgrade all, because of cudnn versions etc!!! +#for eclipse stuff +RUN apt install -y default-jre +RUN apt install -y openjdk-11-jre-headless +RUN apt install -y openjdk-8-jre-headless +RUN apt install -y default-jdk +#RUN apt install -y openjdk-9-jre-headless + \ No newline at end of file diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base index db526b9..316bb3f 100644 --- a/docker/Dockerfile_base +++ b/docker/Dockerfile_base @@ -76,6 +76,7 @@ RUN pip install gpustat RUN pip install setGPU +RUN apt install -y unzip diff --git a/training/DeepJet_callbacks.py b/training/DeepJet_callbacks.py index d9f2685..2616e6d 100644 --- a/training/DeepJet_callbacks.py +++ b/training/DeepJet_callbacks.py @@ -253,13 +253,14 @@ def __init__(self, batchsize=10, on_epoch_end=False, use_event=0, - decay_function=None + decay_function=None, + offset=0 ): super(PredictCallback, self).__init__() self.samplefile=samplefile self.function_to_apply=function_to_apply self.counter=0 - self.call_counter=0 + self.call_counter=offset self.decay_function=decay_function self.after_n_batches=after_n_batches @@ -276,7 +277,6 @@ def __init__(self, self.batchsize = 1 self.td = td - self.gen = trainDataGenerator() self.gen.setBatchSize(batchsize) self.gen.setSkipTooLargeBatches(False) @@ -309,8 +309,6 @@ def genfunc(): def on_epoch_end(self, epoch, logs=None): self.counter=0 - if self.decay_function is not None: - self.after_n_batches=self.decay_function(self.after_n_batches) if not self.run_on_epoch_end: return self.predict_and_call(epoch) @@ -320,6 +318,8 @@ def on_batch_end(self, batch, logs=None): if self.counter>self.after_n_batches: self.counter=0 self.predict_and_call(batch) + if self.decay_function is not None: + self.after_n_batches=self.decay_function(self.call_counter) diff --git a/training/training_base.py b/training/training_base.py index 9ef6bef..f7af9c8 100644 --- a/training/training_base.py +++ b/training/training_base.py @@ -18,6 +18,7 @@ import tensorflow as tf import tensorflow.keras as keras from keras.utils import multi_gpu_model +import copy import imp try: @@ -199,8 +200,12 @@ def __init__( self.train_data.useweights=useweights if testrun: - self.train_data.split(testrun_fraction) - self.val_data=self.train_data + if len(self.train_data)>1: + self.train_data.split(testrun_fraction) + + self.train_data.dataclass_instance=None #can't be pickled + self.val_data=copy.deepcopy(self.train_data) + else: self.val_data=self.train_data.split(splittrainandtest) @@ -238,11 +243,12 @@ def modelSet(self): def setModel(self,model,**modelargs): if len(self.keras_inputs)<1: raise Exception('setup data first') - try: - self.keras_model=model(self.keras_inputs,**modelargs) - except BaseException as e: - print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore') - raise e + self.keras_model=model(self.keras_inputs,**modelargs) + #try: + # self.keras_model=model(self.keras_inputs,**modelargs) + #except BaseException as e: + # print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore') + # raise e if not self.keras_model: raise Exception('Setting model not successful') @@ -403,6 +409,7 @@ def trainModel(self, if not isinstance(additional_callbacks, list): additional_callbacks=[additional_callbacks] self.callbacks.callbacks.extend(additional_callbacks) + print('starting training') if load_in_mem: