From ac36a48823917edba29e7f59f5690a4372b0da72 Mon Sep 17 00:00:00 2001
From: Jan Kieseler <jkiesele@cern.ch>
Date: Wed, 29 Jan 2020 15:51:26 +0100
Subject: [PATCH] 2.1

---
 DataCollection.py              |  5 +++++
 TrainData.py                   |  1 +
 compiled/interface/trainData.h |  2 +-
 docker/Dockerfile              | 28 +++++++++++++++++++++++++++-
 docker/Dockerfile_base         |  1 +
 training/DeepJet_callbacks.py  | 10 +++++-----
 training/training_base.py      | 21 ++++++++++++++-------
 7 files changed, 54 insertions(+), 14 deletions(-)
diff --git a/DataCollection.py b/DataCollection.py
index 859ef3c..36da23c 100644
--- a/DataCollection.py
+++ b/DataCollection.py
@@ -92,9 +92,14 @@ def __radd__(self, other):
         else:
             raise ValueError("I don't know how to add DataCollection and %s" % type(other))
         
+    def __len__(self):
+        return len(self.samples)
+    
     def _readShapesIfNeeded(self):
         if len(self.samples)<1:
             return
+        if self.dataclass_instance is None:
+            self.dataclass_instance = self.dataclass()
         if self.dataclass_instance.nElements() < 1:
             self.dataclass_instance.readShapesFromFile(self.getSamplePath(self.samples[0]))
         
diff --git a/TrainData.py b/TrainData.py
index f27fdfd..bfc14c0 100644
--- a/TrainData.py
+++ b/TrainData.py
@@ -14,6 +14,7 @@
 
 from DeepJetCore.compiled.c_trainData import trainData
 from DeepJetCore.compiled.c_simpleArray import simpleArray
+import time
 
 def fileTimeOut(fileName, timeOut):
     '''
diff --git a/compiled/interface/trainData.h b/compiled/interface/trainData.h
index a4866e9..289a1fd 100644
--- a/compiled/interface/trainData.h
+++ b/compiled/interface/trainData.h
@@ -317,7 +317,7 @@ template<class T>
 void trainData<T>::readFromFile(std::string filename){
     clear();
     FILE *ifile = fopen(filename.data(), "rb");
-    checkFile(ifile);
+    checkFile(ifile, filename);
     readNested(feature_shapes_, ifile);
     readNested(truth_shapes_, ifile);
     readNested(weight_shapes_, ifile);
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 959154b..41b7ad4 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -9,7 +9,6 @@ RUN cd /usr/share && \
     cd DJC && \
     git clone https://github.com/DL4Jets/DeepJetCore && \
     cd DeepJetCore && \
-    git checkout 2.1rc01 && \
     source docker_env.sh && \
     cd compiled && \
     make -j4
@@ -20,3 +19,30 @@ ENV PATH="/usr/share/DJC/DeepJetCore/bin:${PATH}"
 #/usr/local/lib is for root
 ENV PYTHONPATH="/usr/share/DJC/DeepJetCore/../:${PYTHONPATH}"
 ENV LD_LIBRARY_PATH="/usr/share/DJC/DeepJetCore/compiled:${LD_LIBRARY_PATH}"
+
+
+# helpers for ragged and cuda compilation
+
+
+
+# Workaround for TensorFlow: expose the CUDA install as third_party/gpus/cuda inside the TensorFlow include tree
+
+RUN cd /usr/local/lib/python2.7/dist-packages/tensorflow_core/include/third_party && \
+    mkdir gpus && \
+    cd gpus && \
+    ln -s /usr/local/cuda cuda
+
+
+
+# Eclipse RSE dependencies (TODO: make external)
+
+RUN sed -i "s,# deb http://archive.canonical.com/ubuntu,deb http://archive.canonical.com/ubuntu,g" /etc/apt/sources.list
+RUN apt update
+# Do not run "apt upgrade" here: upgrading everything would change the cuDNN versions etc.
+# Java runtimes and JDK needed for Eclipse
+RUN apt install -y default-jre 
+RUN apt install -y openjdk-11-jre-headless 
+RUN apt install -y openjdk-8-jre-headless 
+RUN apt install -y default-jdk
+#RUN apt install -y openjdk-9-jre-headless 
+    
\ No newline at end of file
diff --git a/docker/Dockerfile_base b/docker/Dockerfile_base
index db526b9..316bb3f 100644
--- a/docker/Dockerfile_base
+++ b/docker/Dockerfile_base
@@ -76,6 +76,7 @@ RUN pip install gpustat
 RUN pip install setGPU
 
 
+RUN apt install -y unzip
 
 
 
diff --git a/training/DeepJet_callbacks.py b/training/DeepJet_callbacks.py
index d9f2685..2616e6d 100644
--- a/training/DeepJet_callbacks.py
+++ b/training/DeepJet_callbacks.py
@@ -253,13 +253,14 @@ def __init__(self,
                  batchsize=10,
                  on_epoch_end=False,
                  use_event=0,
-                 decay_function=None
+                 decay_function=None,
+                 offset=0
                  ):
         super(PredictCallback, self).__init__()
         self.samplefile=samplefile
         self.function_to_apply=function_to_apply
         self.counter=0
-        self.call_counter=0
+        self.call_counter=offset
         self.decay_function=decay_function
         
         self.after_n_batches=after_n_batches
@@ -276,7 +277,6 @@ def __init__(self,
             
         self.batchsize = 1    
         self.td = td
-        
         self.gen = trainDataGenerator()
         self.gen.setBatchSize(batchsize)
         self.gen.setSkipTooLargeBatches(False)
@@ -309,8 +309,6 @@ def genfunc():
     
     def on_epoch_end(self, epoch, logs=None):
         self.counter=0
-        if self.decay_function is not None:
-            self.after_n_batches=self.decay_function(self.after_n_batches)
         if not self.run_on_epoch_end: return
         self.predict_and_call(epoch)
         
@@ -320,6 +318,8 @@ def on_batch_end(self, batch, logs=None):
         if self.counter>self.after_n_batches: 
             self.counter=0
             self.predict_and_call(batch)
+            if self.decay_function is not None:
+                self.after_n_batches=self.decay_function(self.call_counter)
         
         
            
diff --git a/training/training_base.py b/training/training_base.py
index 9ef6bef..f7af9c8 100644
--- a/training/training_base.py
+++ b/training/training_base.py
@@ -18,6 +18,7 @@
 import tensorflow as tf
 import tensorflow.keras as keras
 from keras.utils import multi_gpu_model
+import copy
 
 import imp
 try:
@@ -199,8 +200,12 @@ def __init__(
         self.train_data.useweights=useweights
         
         if testrun:
-            self.train_data.split(testrun_fraction)
-            self.val_data=self.train_data
+            if len(self.train_data)>1:
+                self.train_data.split(testrun_fraction)
+
+            self.train_data.dataclass_instance=None #can't be pickled
+            self.val_data=copy.deepcopy(self.train_data)
+            
         else:    
             self.val_data=self.train_data.split(splittrainandtest)
         
@@ -238,11 +243,12 @@ def modelSet(self):
     def setModel(self,model,**modelargs):
         if len(self.keras_inputs)<1:
             raise Exception('setup data first') 
-        try:
-            self.keras_model=model(self.keras_inputs,**modelargs)
-        except BaseException as e:
-            print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore')
-            raise e
+        self.keras_model=model(self.keras_inputs,**modelargs)
+        #try:
+        #    self.keras_model=model(self.keras_inputs,**modelargs)
+        #except BaseException as e:
+        #    print('problem in setting model. Reminder: since DJC 2.0, NClassificationTargets and RegressionTargets must not be specified anymore')
+        #    raise e
         if not self.keras_model:
             raise Exception('Setting model not successful') 
         
@@ -403,6 +409,7 @@ def trainModel(self,
             if not isinstance(additional_callbacks, list):
                 additional_callbacks=[additional_callbacks]
             self.callbacks.callbacks.extend(additional_callbacks)
+            
         
         print('starting training')
         if load_in_mem: