Merge pull request #135 from automl/development
[WIP] prepare release
mfeurer authored Dec 9, 2016
2 parents d7aa29b + 65a75d6 commit 7430205
Showing 17 changed files with 208 additions and 149 deletions.
18 changes: 17 additions & 1 deletion changelog.md
@@ -1,3 +1,19 @@
# 0.2.2

* FIX 124: SMAC could crash if the number of instances was less than seven
* FIX 126: Memory limit was not correctly passed to the target algorithm
evaluator
* Local search is now started from the configurations with the highest EI,
  drawn by random sampling
* Reduce the number of trees to 10 to allow faster predictions (as in SMAC2)
* Do an adaptive number of stochastic local search iterations instead of a fixed
  number (a5914a1d97eed2267ae82f22bd53246c92fe1e2c)
* FIX a bug which prevented SMAC from running at least two configurations per
  call to intensify
* ADD a more efficient data structure to update the cost of a configuration
* FIX only count a challenger as run if it actually was run
  (and not merely considered) (a993c29abdec98c114fc7d456ded1425a6902ce3)

# 0.2.1

* CI: travis-ci continuous integration on OSX
@@ -61,4 +77,4 @@

# 0.0.1

* initial release
* initial release
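The 0.2.2 entry "ADD a more efficient data structure to update the cost of a configuration" corresponds to the incremental bookkeeping visible in the smac/runhistory/runhistory.py diff further down. A minimal standalone sketch of the idea — a running mean per configuration, updated in O(1) per new run; the class and attribute names here are illustrative, not SMAC's API:

```python
class RunningCost:
    """Incrementally maintained average cost per configuration id."""

    def __init__(self):
        self.cost_per_config = {}   # config_id -> mean cost over its runs
        self.runs_per_config = {}   # config_id -> number of runs seen so far

    def incremental_update_cost(self, config_id, cost):
        n_runs = self.runs_per_config.get(config_id, 0)
        old_mean = self.cost_per_config.get(config_id, 0.0)
        # update the running mean without re-aggregating all earlier runs
        self.cost_per_config[config_id] = old_mean + (cost - old_mean) / (n_runs + 1)
        self.runs_per_config[config_id] = n_runs + 1


tracker = RunningCost()
for cost in (0.8, 0.6, 0.7):
    tracker.incremental_update_cost(config_id=1, cost=cost)
print(tracker.cost_per_config[1])   # ~0.7, the mean of the three observed costs
```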
1 change: 1 addition & 0 deletions examples/branin/branin_scenario.txt
@@ -2,3 +2,4 @@ algo = python branin.py
paramfile = branin_pcs.pcs
run_obj = quality
runcount_limit = 500
deterministic = 1
19 changes: 9 additions & 10 deletions examples/rf.py
@@ -109,22 +109,21 @@ def rfr(cfg, seed):
"deterministic": "true",
"memory_limit": 1024,
})

# register function to be optimize
taf = ExecuteTAFuncDict(rfr)


# Optimize
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
tae_runner=rfr)

# example call of the function
# it returns: Status, Cost, Runtime, Additional Infos
def_value = taf.run(cs.get_default_configuration())[1]
def_value = smac.solver.intensifier.tae_runner.run(
cs.get_default_configuration(), 1)[1]
print("Default Value: %.2f" % (def_value))

# Optimize
smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
tae_runner=taf)

try:
incumbent = smac.optimize()
finally:
incumbent = smac.solver.incumbent

inc_value = taf.run(incumbent)[1]
inc_value = smac.solver.intensifier.tae_runner.run(incumbent, 1)[1]
print("Optimized Value: %.2f" % (inc_value))
4 changes: 2 additions & 2 deletions smac/__init__.py
@@ -1,3 +1,3 @@
VERSION = '0.2.1'
VERSION = '0.2.2'
AUTHORS = "Marius Lindauer, Matthias Feurer, Katharina Eggensperger, " \
"Aaron Klein, Stefan Falkner and Frank Hutter"
"Aaron Klein, Stefan Falkner and Frank Hutter"
2 changes: 1 addition & 1 deletion smac/epm/rf_with_instances.py
@@ -53,7 +53,7 @@ class RandomForestWithInstances(AbstractEPM):

def __init__(self, types,
instance_features=None,
num_trees=30,
num_trees=10,
do_bootstrapping=True,
n_points_per_tree=0,
ratio_features=5. / 6.,
1 change: 1 addition & 0 deletions smac/facade/smac_facade.py
@@ -143,6 +143,7 @@ def __init__(self,
tae_runner = ExecuteTAFuncDict(ta=tae_runner,
stats=self.stats,
run_obj=scenario.run_obj,
memory_limit=scenario.memory_limit,
runhistory=runhistory,
par_factor=scenario.par_factor)
# Third case, if it is an ExecuteTaRun we can simply use the
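This facade change forwards scenario.memory_limit to ExecuteTAFuncDict, closing FIX 126 from the changelog. As a rough standalone illustration of what enforcing a memory cap on a target-function call means — not SMAC's actual mechanism, and using only the POSIX standard library with illustrative names:

```python
import resource  # POSIX-only


def run_with_memory_limit(func, memory_limit_mb, *args, **kwargs):
    """Call func under an address-space cap; translate a memory-out into None."""
    soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    resource.setrlimit(resource.RLIMIT_AS, (memory_limit_mb * 1024 * 1024, hard))
    try:
        return func(*args, **kwargs)
    except MemoryError:
        # a real evaluator would record a MEMOUT/CRASHED status here
        return None
    finally:
        # restore the previous limit so the rest of the process is unaffected
        resource.setrlimit(resource.RLIMIT_AS, (soft, hard))


def waste_memory():
    return [0] * (1024 * 1024 * 1024)  # roughly 8 GB of list elements


print(run_with_memory_limit(waste_memory, memory_limit_mb=512))  # -> None
```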
6 changes: 3 additions & 3 deletions smac/initial_design/multi_config_initial_design.py
@@ -95,9 +95,9 @@ def run(self) -> Configuration:
# (also not on the incumbent)
# therefore, at least two different configurations have to be in <configs>
inc, inc_perf = self.intensifier.intensify(challengers=set(configs[1:]),
incumbent=configs[0],
run_history=self.runhistory,
aggregate_func=self.aggregate_func)
incumbent=configs[0],
run_history=self.runhistory,
aggregate_func=self.aggregate_func)

else:
self.logger.debug("All initial challengers are identical")
46 changes: 30 additions & 16 deletions smac/intensification/intensification.py
@@ -116,13 +116,15 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,
raise ValueError("time_bound must be >= 0.01")

num_run = 0
chall_indx = 0

# Line 1 + 2
for chall_indx, challenger in enumerate(challengers):
for challenger in challengers:
if challenger == incumbent:
self.logger.warning(
"Challenger was the same as the current incumbent; Skipping challenger")
continue

self.logger.debug("Intensify on %s", challenger)
if hasattr(challenger, 'origin'):
self.logger.debug(
@@ -163,11 +165,12 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,
next_instance = self.rs.choice(list(available_insts))
# Line 7
self.logger.debug("Add run of incumbent")
status, cost, dur, res = self.tae_runner.start(config=incumbent,
instance=next_instance,
seed=next_seed,
cutoff=self.cutoff,
instance_specific=self.instance_specifics.get(next_instance, "0"))
status, cost, dur, res = self.tae_runner.start(
config=incumbent,
instance=next_instance,
seed=next_seed,
cutoff=self.cutoff,
instance_specific=self.instance_specifics.get(next_instance, "0"))

num_run += 1
else:
@@ -179,6 +182,10 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,

inc_inst_seeds = set(run_history.get_runs_for_config(incumbent))
inc_perf = aggregate_func(incumbent, run_history, inc_inst_seeds)

# at least one run of challenger
# to increase chall_indx counter
first_run = False

# Line 9
while True:
@@ -187,7 +194,7 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,

# Line 10
missing_runs = list(inc_inst_seeds - chall_inst_seeds)

# Line 11
self.rs.shuffle(missing_runs)
to_run = missing_runs[:min(N, len(missing_runs))]
@@ -219,13 +226,18 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,

else:
cutoff = self.cutoff


if not first_run:
first_run = True
chall_indx += 1

self.logger.debug("Add run of challenger")
status, cost, dur, res = self.tae_runner.start(config=challenger,
instance=instance,
seed=seed,
cutoff=cutoff,
instance_specific=self.instance_specifics.get(instance, "0"))
status, cost, dur, res = self.tae_runner.start(
config=challenger,
instance=instance,
seed=seed,
cutoff=cutoff,
instance_specific=self.instance_specifics.get(instance, "0"))
num_run += 1

# we cannot use inst_seed_pairs here since we could have less runs
@@ -253,7 +265,7 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,

n_samples = len(inst_seed_pairs)
self.logger.info("Challenger (%.4f) is better than incumbent (%.4f) on %d runs." % (
chal_perf / n_samples, inc_perf / n_samples, n_samples))
chal_perf, inc_perf, n_samples))
self.logger.info(
"Changing incumbent to challenger: %s" % (challenger))
incumbent = challenger
@@ -268,11 +280,11 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,
# challenger is not worse, continue
N = 2 * N

if chall_indx >= 1 and num_run > self.run_limit:
if chall_indx > 1 and num_run > self.run_limit:
self.logger.debug(
"Maximum #runs for intensification reached")
break
elif chall_indx >= 1 and time.time() - self.start_time - time_bound >= 0:
elif chall_indx > 1 and time.time() - self.start_time - time_bound >= 0:
self.logger.debug("Timelimit for intensification reached ("
"used: %f sec, available: %f sec)" % (
time.time() - self.start_time, time_bound))
@@ -283,5 +295,7 @@ def intensify(self, challengers, incumbent, run_history, aggregate_func,
inc_perf = aggregate_func(incumbent, run_history, inc_runs)
self.logger.info("Updated estimated performance of incumbent on %d runs: %.4f" % (
len(inc_runs), inc_perf))

self.stats.update_average_configs_per_intensify(n_configs=chall_indx)

return incumbent, inc_perf
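Two of the intensification changes above work together: a challenger now only increments chall_indx once it has actually been run (via the first_run flag), and the run/time budget is only checked after more than one challenger has been counted, which relates to the changelog fix about running at least two configurations per call to intensify. A toy sketch of that counting pattern, with illustrative names outside of SMAC:

```python
def race(challengers, run_limit, should_run):
    """Count only challengers that were actually run at least once."""
    num_run = 0
    chall_indx = 0
    for challenger in challengers:
        first_run = False
        for instance in ("inst1", "inst2"):
            if not should_run(challenger, instance):
                continue
            if not first_run:
                # the challenger now counts towards the intensification budget
                first_run = True
                chall_indx += 1
            num_run += 1
        # budget checks only apply once more than one challenger was counted
        if chall_indx > 1 and num_run > run_limit:
            break
    return chall_indx, num_run


print(race(["a", "b", "c"], run_limit=3, should_run=lambda c, i: c != "b"))
# -> (2, 4): "b" is never run, so it is not counted as a challenger
```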
47 changes: 32 additions & 15 deletions smac/runhistory/runhistory.py
@@ -18,6 +21,9 @@
RunKey = collections.namedtuple(
'RunKey', ['config_id', 'instance_id', 'seed'])

InstSeedKey = collections.namedtuple(
'InstSeedKey', ['instance', 'seed'])

RunValue = collections.namedtuple(
'RunValue', ['cost', 'time', 'status', 'additional_info'])

@@ -41,6 +44,10 @@ def __init__(self, aggregate_func):
# order as it was added.
self.data = collections.OrderedDict()

# for fast access, we have also an unordered data structure
# to get all instance seed pairs of a configuration
self._configid_to_inst_seed = {}

self.config_ids = {} # config -> id
self.ids_config = {} # id -> config
self._n_id = 0
@@ -88,8 +95,14 @@ def add(self, config, cost, time,

k = RunKey(config_id, instance_id, seed)
v = RunValue(cost, time, status, additional_info)

self.data[k] = v

# also add to fast data structure
is_k = InstSeedKey(instance_id, seed)
self._configid_to_inst_seed[
config_id] = self._configid_to_inst_seed.get(config_id, [])
self._configid_to_inst_seed[config_id].append(is_k)

# assumes an average across runs as cost function
self.incremental_update_cost(config, cost)

@@ -156,6 +169,10 @@ def incremental_update_cost(self, config: Configuration, cost: float):
self.runs_per_config[config_id] = n_runs + 1

def get_cost(self, config):
'''
returns empirical cost for a configuration;
uses self.cost_per_config
'''
config_id = self.config_ids[config]
return self.cost_per_config[config_id]

@@ -170,18 +187,12 @@ def get_runs_for_config(self, config):
----------
list: tuples of instance, seed
"""
InstanceSeedPair = collections.namedtuple("InstanceSeedPair",
["instance", "seed"])
config_id = self.config_ids.get(config)
list_ = []
for k in self.data:
# TA will return ABORT if config. budget was exhausted and
# we don't want to collect such runs to compute the cost of a
# configuration
if config_id == k.config_id and self.data[k].status not in [StatusType.ABORT]:
ist = InstanceSeedPair(k.instance_id, k.seed)
list_.append(ist)
return list_
is_list = self._configid_to_inst_seed.get(config_id)
if is_list is None:
return []
else:
return is_list

def empty(self):
"""
@@ -240,9 +251,15 @@ def load_json(self, fn, cs):

self._n_id = len(self.config_ids)

self.data = {RunKey(int(k[0]), k[1], int(k[2])):
RunValue(float(v[0]), float(v[1]), v[2], v[3])
for k, v in all_data["data"]}
# important to use add method to use all data structure correctly
for k, v in all_data["data"]:
self.add(config=self.ids_config[int(k[0])],
cost=float(v[0]),
time=float(v[1]),
status=v[2],
instance_id=k[1],
seed=int(k[2]),
additional_info=v[3])

def update_from_json(self, fn, cs):
"""Update the current runhistory by adding new runs from a json file.
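The runhistory change above replaces a linear scan over all stored runs in get_runs_for_config with a per-configuration index of (instance, seed) pairs that is maintained inside add(). A minimal standalone sketch of that lookup structure — illustrative names, not the SMAC classes themselves:

```python
import collections

InstSeedKey = collections.namedtuple('InstSeedKey', ['instance', 'seed'])


class RunIndex:
    """Map each config id to the (instance, seed) pairs it has been run on."""

    def __init__(self):
        self._configid_to_inst_seed = {}

    def add(self, config_id, instance, seed):
        # O(1) append at insertion time instead of scanning all runs later
        self._configid_to_inst_seed.setdefault(config_id, []).append(
            InstSeedKey(instance, seed))

    def get_runs_for_config(self, config_id):
        # configurations without runs simply return an empty list
        return self._configid_to_inst_seed.get(config_id, [])


index = RunIndex()
index.add(1, "inst_a", 42)
index.add(1, "inst_b", 42)
print(index.get_runs_for_config(1))
# [InstSeedKey(instance='inst_a', seed=42), InstSeedKey(instance='inst_b', seed=42)]
print(index.get_runs_for_config(2))  # []
```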
2 changes: 1 addition & 1 deletion smac/scenario/scenario.py
@@ -317,7 +317,7 @@ def extract_instance_specific(instance_list):
#PCA
pca = PCA(n_components=self.PCA_DIM)
self.feature_array = pca.fit_transform(X)
self.n_features = self.PCA_DIM
self.n_features = self.feature_array.shape[1]
# update feature dictionary
for feat, inst_ in zip(self.feature_array, self.train_insts):
self.feature_dict[inst_] = feat
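The scenario change above stops assuming that the transformed feature array has exactly PCA_DIM columns and instead reads the dimensionality from the array itself. A small standalone illustration of the rationale, under the (hypothetical, not taken from SMAC's code) assumption that the number of components can be capped by the available raw features:

```python
import numpy as np
from sklearn.decomposition import PCA

PCA_DIM = 7                              # requested number of components
X = np.random.rand(50, 4)                # 50 instances, but only 4 raw features

pca = PCA(n_components=min(PCA_DIM, X.shape[1]))
feature_array = pca.fit_transform(X)

# the hard-coded constant and the actual array dimensionality now disagree,
# so downstream code should trust the array, not PCA_DIM
print(PCA_DIM, feature_array.shape[1])   # 7 vs. 4
```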