gaml
gamlAlgorithms.hpp
1 #pragma once
2 
3 /*
4  * Copyright (C) 2012, Supelec
5  *
6  * Author : Hervé Frezza-Buet, Frédéric Pennerath
7  *
8  * Contributor :
9  *
10  * This library is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License (GPL) as published by the Free Software Foundation; either
13  * version 3 of the License, or any later version.
14  *
15  * This library is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this library; if not, write to the Free Software
22  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23  *
24  * Contact : herve.frezza-buet@supelec.fr, frederic.pennerath@supelec.fr
25  *
26  */
27 
28 #include <iostream>
29 #include <string>
30 #include <vector>
31 #include <map>
32 #include <list>
33 #include <array>
34 #include <tuple>
35 #include <stdexcept>
36 #include <iterator>
37 #include <utility>
38 #include <numeric>
39 #include <algorithm>
40 #include <cmath>
41 
42 #include <cstdlib>
43 #include <gamlMerge.hpp>
44 #include <gamlPartition.hpp>
45 #include <gamlMap.hpp>
46 #include <iostream>
47 #include <iomanip>
48 #include <random>
49 #include <type_traits>
50 
51 namespace gaml {
52 
53 
54  namespace concepts {
55 
56  typedef int any;
60  class Predictor {
61  public:
62 
63  typedef any input_type;
64  typedef any output_type;
65 
66  Predictor(const Predictor& other);
67  Predictor& operator=(const Predictor& other);
68  output_type operator()(const input_type& x) const;
69  };
70 
74  class Learner {
75  public:
76 
77  typedef any predictor_type;
78 
79  Learner(const Learner& other);
80  Learner& operator=(const Learner& other);
81 
82  template<typename DataIterator, typename InputOf, typename OutputOf>
83  predictor_type operator()(const DataIterator& begin, const DataIterator& end,
84  const InputOf&, const OutputOf&) const;
85  };
86 
87 
92  public:
93 
94  typedef double value_type;
95 
97 
98  template<typename Predictor, typename DataIterator, typename InputOf, typename OutputOf>
99  value_type operator()(const Predictor& predictor,const DataIterator& begin, const DataIterator& end,
100  const InputOf&, const OutputOf&) const;
101  };
102 
103 
108  public:
109 
110  typedef double value_type;
111 
112  LearnerEvaluator(const LearnerEvaluator& other);
113 
114  template<typename Learner, typename DataIterator, typename InputOf, typename OutputOf>
115  value_type operator()(const Learner& learner,const DataIterator& begin, const DataIterator& end,
116  const InputOf&, const OutputOf&) const;
117  };
118 
119  }
120 
121  namespace risk {
122 
123  template<typename Predictor,typename DataIterator,
124  typename InputOf, typename OutputOf,
125  typename Loss, typename AccumIterator>
126  double accumulation(const Predictor& predictor,
127  const DataIterator& begin, const DataIterator& end,
128  const InputOf& inputOf,const OutputOf& outputOf,
129  const Loss& loss, AccumIterator& acc) {
130  for(DataIterator it = begin; it != end; ++it)
131  *acc++ = loss(predictor(inputOf(*it)),
132  outputOf(*it));
133  return (double)(acc());
134  }
135 
136  template<typename LOSS>
137  class Empirical {
138  private:
139 
140  LOSS loss;
141 
142  public:
143 
144  Empirical(const LOSS& l) : loss(l) {}
145  Empirical(const Empirical& other) : loss(other.loss) {}
146 
147  template<typename Predictor, typename DataIterator, typename InputOf, typename OutputOf>
148  double operator()(const Predictor& predictor,const DataIterator& begin, const DataIterator& end,
149  const InputOf& inputOf, const OutputOf& outputOf) const {
150  unsigned int nb = 0;
151  double sum = 0;
152  for(DataIterator it = begin; it != end; ++it) {
153  auto& data = *it;
154  sum += loss(predictor(inputOf(data)),
155  outputOf(data));
156  ++nb;
157  }
158 
159  return sum/nb;
160  }
161  };
162 
163  template<typename LOSS>
164  Empirical<LOSS> empirical(const LOSS& loss) {return Empirical<LOSS>(loss);}
165 
166 
167  template<typename LOSS, typename PARTITION>
169  private:
170 
171  LOSS loss;
172  PARTITION partition;
173  bool verbose;
174 
175  public:
176 
177 
178  CrossValidation(const LOSS& l, const PARTITION& part,bool verbosity)
179  : loss(l), partition(part), verbose(verbosity) {}
180  CrossValidation(const CrossValidation& other) : loss(other.loss), partition(other.partition), verbose(other.verbose) {}
181 
182  template<typename Learner, typename DataIterator, typename InputOf, typename OutputOf>
183  double operator()(const Learner& learner,const DataIterator& begin, const DataIterator& end,
184  const InputOf& inputOf, const OutputOf& outputOf) const {
185 
186  auto built_partition = partition.build(begin,end);
187  double sum = 0;
188 
189  if(verbose)
190  std::cout << "Splitting the database into " << built_partition.size() << " sets." << std::endl;
191  for(unsigned int i = 0; i < built_partition.size(); ++i) {
192 
193  auto begin = built_partition.begin(i);
194  auto end = built_partition.end(i);
195  auto _begin = built_partition.complement_begin(i);
196  auto _end = built_partition.complement_end(i);
197 
198  if(verbose)
199  std::cout << std::setw(6) << i+1 << '/' << built_partition.size()
200  << " : learning...\r" << std::flush;
201 
202  auto predictor = learner(_begin, _end, inputOf, outputOf);
203 
204 
205  double risk=0;
206  for(DataIterator it = begin; it != end; ++it) {
207  auto& data = *it;
208  risk += loss(predictor(inputOf(data)),
209  outputOf(data));
210  }
211 
212  if(verbose) {
213  auto size = std::distance(begin,end);
214  std::cout << std::setw(6) << i+1 << '/' << built_partition.size()
215  << " : risk = " << risk / size << " ("
216  << size << "-sized test set)" << std::endl;
217  }
218 
219  sum += risk;
220  }
221 
222  return (double)(sum/(double)(built_partition.data_size()));
223  }
224  };
225 
226  template<typename LOSS, typename PARTITION>
227  CrossValidation<LOSS,PARTITION> cross_validation(const LOSS& l, const PARTITION& part,bool verbosity) {
228  return CrossValidation<LOSS,PARTITION>(l,part,verbosity);
229  }
230 
231  }
232 
236  class integer {
237  private:
238  int j;
239  public:
240 
241  using difference_type = long;
242  using value_type = int;
243  using pointer = value_type*;
244  using reference = value_type&;
245  using iterator_category = std::random_access_iterator_tag;
246 
247 
248  integer() : j(0) {}
249  integer(const integer& cp) : j(cp.j) {}
250  integer(int i) : j(i) {}
251  integer& operator=(int i) {j=i; return *this;}
252  integer& operator=(const integer& cp) {j=cp.j; return *this;}
253  integer& operator++() {++j; return *this;}
254  integer& operator--() {--j; return *this;}
255  integer& operator+=(int diff) {j+=diff; return *this;}
256  integer& operator-=(int diff) {j-=diff; return *this;}
257  integer operator++(int) {integer res = *this; ++*this; return res;}
258  integer operator--(int) {integer res = *this; --*this; return res;}
259  int operator-(const integer& i) const {return j - i.j;}
260  integer operator+(int i) const {return integer(j+i);}
261  integer operator-(int i) const {return integer(j-i);}
262  const int& operator*() const {return j;}
263  bool operator==(const integer& i) const {return j == i.j;}
264  bool operator!=(const integer& i) const {return j != i.j;}
265  };
266 
267  namespace functor {
268  class average {
269  public:
270  typedef double output_type;
271  template<typename DataIterator,typename ValueOf>
272  double operator()(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) const {
273  if(begin == end)
274  throw std::runtime_error("Average called on an empty collection");
275 
276  // std::accumulate does not work... Is there a problem in map iterator définition ?
277 
278  // auto values = gaml::map(begin,end,value_of);
279  // return (Y)(std::accumulate(values.begin(),values.end(),0)/(double)nb);
280 
281  // The version without std::accumulate does not require nb.
282  double sum = 0;
283  unsigned int nb = 0;
284  for(auto it = begin; it != end; ++it, ++nb) sum += (double)(value_of(*it));
285  return sum/(double)nb;
286  }
287  };
288  }
289 
294  template<typename DataIterator,typename ValueOf>
295  double average(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) {
296  functor::average avg;
297  return avg(begin,end,value_of);
298  }
299 
300  namespace functor {
301  class variance {
302  public:
303  typedef double output_type;
304  template<typename DataIterator,typename ValueOf>
305  double operator()(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) const {
306  double n = 0;
307  double mean = 0;
308  double M2 = 0;
309  for(auto it = begin; it != end; ++it) {
310  double x = (double)(value_of(*it));
311  double delta = x - mean;
312  mean = mean + delta/(++n);
313  M2 = M2 + delta*(x - mean);
314  }
315  if (n < 2) return 0;
316  return M2/(n - 1);
317  }
318  };
319  }
320 
325  template<typename DataIterator,typename ValueOf>
326  double variance(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) {
327  functor::variance var;
328  return var(begin,end,value_of);
329  }
330 
331  namespace by_default {
332  template<typename VALUE>
333  struct LesserThan {
334  bool operator()(const VALUE& v1, const VALUE& v2) const {
335  return v1 < v2;
336  }
337  };
338  }
339 
340  namespace functor {
341  template<typename VALUE, typename COMP = by_default::LesserThan<VALUE> >
342  class frequencies {
343  public:
344  template<typename DataIterator, typename ValueOf>
345  std::map<VALUE,double,COMP> operator()(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) const {
346 
347  if(begin == end)
348  throw std::runtime_error("Frequencies called on an empty collection");
349 
350  std::map<VALUE,double,COMP> frequencies;
351  for(auto it = begin; it != end; ++it) {
352  auto value = value_of(*it);
353  auto mapit = frequencies.find(value);
354  if(mapit == frequencies.end())
355  frequencies[value] = 1;
356  else
357  ++(mapit->second);
358  }
359 
360  double size = (double)(std::distance(begin,end));
361  for(auto& kv : frequencies) kv.second /= size;
362  return frequencies;
363  }
364  };
365  }
366 
370  template<typename VALUE, typename COMP = by_default::LesserThan<VALUE>, typename DataIterator, typename ValueOf>
371  std::map<VALUE,double,COMP> frequencies(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) {
373  return f(begin,end,value_of);
374  }
375 
380  template<typename Map>
381  typename Map::key_type most_frequent(const Map& frequencies) {
382 
383  auto it = std::max_element(frequencies.begin(), frequencies.end(),
384  [](const std::pair<typename Map::key_type,double>& e1,
385  const std::pair<typename Map::key_type,double>& e2) -> bool {return e1.second < e2.second;});
386  return it->first;
387  }
388 
389 
390  namespace functor {
391  template<typename VALUE, typename COMP = by_default::LesserThan<VALUE>>
393  public:
394  typedef VALUE output_type;
395  template<typename DataIterator, typename ValueOf>
396  VALUE operator()(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) const {
397  return (VALUE)(gaml::most_frequent(gaml::frequencies<VALUE,COMP>(begin,end,value_of)));
398  }
399  };
400  }
401 
406  template<typename VALUE, typename COMP = by_default::LesserThan<VALUE>, typename DataIterator, typename ValueOf>
407  VALUE most_frequent(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) {
409  return mf(begin,end,value_of);
410  }
411 
412 
413  namespace functor {
414  template<typename VALUE>
416  public:
417  typedef VALUE output_type;
418  template<typename DataIterator, typename ValueOf>
419  VALUE operator()(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) const {
420  auto it = begin;
421  auto cumulated_frequencies = value_of(*it);
422  for(++it; it != end; ++it)
423  for(auto& kv : value_of(*it)) {
424  auto fit = cumulated_frequencies.find(kv.first);
425  if(fit == cumulated_frequencies.end())
426  cumulated_frequencies[kv.first] = kv.second;
427  else
428  fit->second += kv.second;
429  }
430  return (VALUE)(gaml::most_frequent(cumulated_frequencies));
431  }
432  };
433  }
434 
439  template<typename VALUE, typename DataIterator, typename ValueOf>
440  VALUE highest_cumulated_frequency(const DataIterator& begin, const DataIterator& end, const ValueOf& value_of) {
442  return hcf(begin,end,value_of);
443  }
444 
448  template<typename Class, typename DataIterator, typename ClassComp = by_default::LesserThan<Class>, typename ClassOf>
449  double classification_entropy(const DataIterator& begin, const DataIterator& end, const ClassOf& class_of) {
450  auto freq = gaml::frequencies<Class,ClassComp>(begin,end,class_of);
451  double H = 0;
452  for(auto& kv : freq) {
453  double p = kv.second;
454  H -= p*std::log2(p);
455  }
456  return H;
457  }
458 }
gaml::functor::frequencies
Definition: gamlAlgorithms.hpp:342
gaml::functor::highest_cumulated_frequency
Definition: gamlAlgorithms.hpp:415
gaml::concepts::LearnerEvaluator
Evaluation of a predictor on a data set.
Definition: gamlAlgorithms.hpp:107
gaml::risk::Empirical
Definition: gamlAlgorithms.hpp:137
gaml::functor::average
Definition: gamlAlgorithms.hpp:268
gaml::by_default::LesserThan
Definition: gamlAlgorithms.hpp:333
gaml::integer
Definition: gamlAlgorithms.hpp:236
gaml::risk::CrossValidation
Definition: gamlAlgorithms.hpp:168
gaml::concepts::Learner
This learns from a data set.
Definition: gamlAlgorithms.hpp:74
gaml::concepts::Predictor
This predicts a label from an input.
Definition: gamlAlgorithms.hpp:60
gaml::functor::variance
Definition: gamlAlgorithms.hpp:301
gaml::functor::most_frequent
Definition: gamlAlgorithms.hpp:392
gaml::concepts::PredictorEvaluator
Evaluation of a predictor on a data set.
Definition: gamlAlgorithms.hpp:91