KSquare Utilities
CrossValidation.cpp
Go to the documentation of this file.
1 #include "FirstIncludes.h"
2 #include <stdio.h>
3 #include <iomanip>
4 #include <string>
5 #include <iostream>
6 #include <fstream>
7 #include <vector>
8 #include "MemoryDebug.h"
9 using namespace std;
10 
11 
12 #include "KKBaseTypes.h"
13 #include "OSservices.h"
14 #include "RunLog.h"
15 using namespace KKB;
16 
17 
18 #include "CrossValidation.h"
19 #include "Classifier2.h"
20 #include "ConfusionMatrix2.h"
21 #include "FactoryFVProducer.h"
22 #include "FileDesc.h"
23 #include "MLClass.h"
24 #include "FeatureVector.h"
27 #include "TrainingProcess2.h"
28 using namespace KKMLL;
29 
30 
31 
32 CrossValidation::CrossValidation (TrainingConfiguration2Ptr _config,
33  FeatureVectorListPtr _examples,
34  MLClassListPtr _mlClasses,
35  kkint32 _numOfFolds,
36  bool _featuresAreAlreadyNormalized,
37  FileDescPtr _fileDesc,
38  RunLog& _log,
39  bool& _cancelFlag
40  ):
41 
42  cancelFlag (_cancelFlag),
43  config (_config),
44  duplicateTrainDataCount (0),
45  featuresAreAlreadyNormalized (_featuresAreAlreadyNormalized),
46  fileDesc (_fileDesc),
47  foldAccuracies (),
48  foldCounts (),
49  fvProducerFactory (NULL),
50  confusionMatrix (NULL),
51  cmByNumOfConflicts (NULL),
52  examples (NULL),
53  mlClasses (_mlClasses),
54  imagesPerClass (0),
55  maxNumOfConflicts (0),
56  numOfFolds (_numOfFolds),
57  numSVs (0),
58  totalNumSVs (0),
59  numOfWinnersCounts (NULL),
60  numOfWinnersCorrects (NULL),
61  numOfWinnersOneOfTheWinners (NULL),
62  testTime (0.0),
63  trainingTime (0.0),
64 
65  accuracyMean (0.0f),
66  accuracyStdDev (0.0f),
67 
68  avgPredProb (0.0),
69  totalPredProb (0.0),
70 
71  supportPointsMean (0.0f),
72  supportPointsStdDev (0.0f),
73  supportPoints (),
74 
75  testTimeMean (0.0),
76  testTimeStdDev (0.0),
77  testTimes (),
78 
79  trainTimeMean (0.0),
80  trainTimeStdDev (0.0),
81  trainTimes (),
82 
83  weOwnConfusionMatrix (false)
84 
85 {
86  fvProducerFactory = config->FvFactoryProducer (_log);
87  examples = _examples->ExtractExamplesForClassList (mlClasses);
88  if (config)
89  imagesPerClass = config->ExamplesPerClass ();
90  else
91  imagesPerClass = -1;
92 }
93 
94 
95 
97 {
98  DeleteAllocatedMemory ();
99  delete examples; examples = NULL;
100 }
101 
102 
103 
104 
105 void CrossValidation::AllocateMemory ()
106 {
107  maxNumOfConflicts = mlClasses->QueueSize () + 1;
108  confusionMatrix = new ConfusionMatrix2 (*mlClasses);
109  weOwnConfusionMatrix = true;
110  cmByNumOfConflicts = new ConfusionMatrix2Ptr[maxNumOfConflicts];
111 
112  numOfWinnersCounts = new kkint32[maxNumOfConflicts];
113  numOfWinnersCorrects = new kkint32[maxNumOfConflicts];
114  numOfWinnersOneOfTheWinners = new kkint32[maxNumOfConflicts];
115 
116  kkint32 conflictIDX;
117 
118  for (conflictIDX = 0; conflictIDX < maxNumOfConflicts; conflictIDX++)
119  {
120  cmByNumOfConflicts [conflictIDX] = new ConfusionMatrix2 (*mlClasses);
121  numOfWinnersCounts [conflictIDX] = 0;
122  numOfWinnersCorrects [conflictIDX] = 0;
123  numOfWinnersOneOfTheWinners [conflictIDX] = 0;
124  }
125 
126  //foldAccuracies = new float[numOfFolds]; // Changed to vector<float> aka VectorFloat
127  //foldCounts = new kkint32[numOfFolds]; // Changed to vector<kkint32> aka VectorInt
128 } /* AllocateMemory */
129 
130 
131 
132 
133 void CrossValidation::DeleteAllocatedMemory ()
134 {
135  if (weOwnConfusionMatrix)
136  {
137  delete confusionMatrix;
138  confusionMatrix = NULL;
139  }
140 
141  kkint32 conflictIDX;
142 
143  if (cmByNumOfConflicts)
144  {
145  for (conflictIDX = 0; conflictIDX < maxNumOfConflicts; conflictIDX++)
146  {
147  delete cmByNumOfConflicts[conflictIDX];
148  cmByNumOfConflicts[conflictIDX] = NULL;
149  }
150 
151  delete cmByNumOfConflicts;
152  cmByNumOfConflicts = NULL;
153  }
154 
155  // delete foldAccuracies; foldAccuracies = NULL;
156  // delete foldCounts; foldCounts = NULL;
157 
158  delete numOfWinnersCounts; numOfWinnersCounts = NULL;
159  delete numOfWinnersCorrects; numOfWinnersCorrects = NULL;
160  delete numOfWinnersOneOfTheWinners; numOfWinnersOneOfTheWinners = NULL;
161 
162 } /* DeleteAllocatedMemory */
163 
164 
165 
167 {
168  log.Level (10) << "CrossValidation::RunCrossValidation numOfFolds[" << numOfFolds << "]" << endl;
169 
170  if (numOfFolds < 1)
171  {
172  log.Level (-1) << endl
173  << "CrossValidation::RunCrossValidation **** ERROR ****" << endl
174  << endl
175  << " Invalid numOfFolds[" << numOfFolds << "]." << endl
176  << endl;
177  return;
178  }
179 
180  DeleteAllocatedMemory ();
181  AllocateMemory ();
182 
183  kkint32 imageCount = examples->QueueSize ();
184  kkint32 numImagesPerFold = (imageCount + numOfFolds - 1) / numOfFolds;
185  kkint32 firstInGroup = 0;
186 
187  totalPredProb = 0.0;
188 
189 
190  kkint32 foldNum;
191 
192  for (foldNum = 0; foldNum < numOfFolds; foldNum++)
193  {
194  kkint32 lastInGroup;
195 
196  // If We are doing the last Fold Make sure that we are including all the examples
197  // that have not been tested.
198  if (foldNum == (numOfFolds - 1))
199  lastInGroup = imageCount;
200  else
201  lastInGroup = firstInGroup + numImagesPerFold - 1;
202 
203 
204  log.Level (20) << "Fold [" << (foldNum + 1) << "] of [" << numOfFolds << "]" << endl;
205 
206  FeatureVectorListPtr trainingExamples = examples->ManufactureEmptyList (true);
207  FeatureVectorListPtr testImages = examples->ManufactureEmptyList (true);
208 
209  log.Level (30) << "Fold Num[" << foldNum << "] "
210  << "FirstTestImage[" << firstInGroup << "] "
211  << "LastInGroup[" << lastInGroup << "]."
212  << endl;
213 
214  for (kkint32 x = 0; (x < imageCount) && (!cancelFlag); x++)
215  {
216  FeatureVectorPtr newImage = examples->IdxToPtr (x)->Duplicate ();
217  if ((x >= firstInGroup) && (x <= lastInGroup))
218  {
219  testImages->PushOnBack (newImage);
220  }
221  else
222  {
223  trainingExamples->PushOnBack (newImage);
224  }
225  }
226 
227  log.Level (20) << "Number Of Training Images : " << trainingExamples->QueueSize () << endl;
228  log.Level (20) << "Number Of Test Images : " << testImages->QueueSize () << endl;
229 
230  if (cancelFlag)
231  break;
232 
233  CrossValidate (testImages, trainingExamples, foldNum, NULL, log);
234 
235  delete trainingExamples; trainingExamples = NULL;
236  delete testImages; testImages = NULL;
237 
238  firstInGroup = firstInGroup + numImagesPerFold;
239  }
240 
241  if (!cancelFlag)
242  {
243  avgPredProb = totalPredProb / imageCount;
244 
245  CalcMeanAndStdDev (foldAccuracies, accuracyMean, accuracyStdDev);
246  CalcMeanAndStdDev (supportPoints, supportPointsMean, supportPointsStdDev);
247  CalcMeanAndStdDev (testTimes, testTimeMean, testTimeStdDev);
248  CalcMeanAndStdDev (trainTimes, trainTimeMean, trainTimeStdDev);
249  }
250 } /* RunCrossValidation */
251 
252 
253 
254 
255 
256 void CrossValidation::RunValidationOnly (FeatureVectorListPtr validationData,
257  bool* classedCorrectly,
258  RunLog& log
259  )
260 {
261  log.Level (10) << "CrossValidation::RunValidationOnly" << endl;
262  DeleteAllocatedMemory ();
263  AllocateMemory ();
264 
265  totalPredProb = 0.0;
266 
267  // We need to get a duplicate copy of each image data because the trainer and classifier
268  // will normalize the data.
269  FeatureVectorListPtr trainingExamples = examples->DuplicateListAndContents ();
270  FeatureVectorListPtr testImages = validationData->DuplicateListAndContents ();
271 
272  CrossValidate (testImages, trainingExamples, 0, classedCorrectly, log);
273 
274  if (testImages->QueueSize () > 0)
275  avgPredProb = totalPredProb / testImages->QueueSize ();
276  else
277  avgPredProb = 0.0f;
278 
279  delete trainingExamples; trainingExamples = NULL;
280  delete testImages; testImages = NULL;
281 
282 
283  if (!cancelFlag)
284  {
285  CalcMeanAndStdDev (foldAccuracies, accuracyMean, accuracyStdDev);
286  CalcMeanAndStdDev (supportPoints, supportPointsMean, supportPointsStdDev);
287  CalcMeanAndStdDev (testTimes, testTimeMean, testTimeStdDev);
288  CalcMeanAndStdDev (trainTimes, trainTimeMean, trainTimeStdDev);
289  }
290 } /* RunValidationOnly */
291 
292 
293 
294 
295 void CrossValidation::CrossValidate (FeatureVectorListPtr testImages,
296  FeatureVectorListPtr trainingExamples,
297  kkint32 foldNum,
298  bool* classedCorrectly,
299  RunLog& log
300  )
301 {
302  log.Level (20) << "CrossValidation::CrossValidate FoldNum[" << foldNum << "]." << endl;
303 
304  bool cancelFlag = false;
305 
306  KKStr statusMessage;
307 
309  (config,
310  trainingExamples,
311  false, /**< false = DON'T take ownership of 'trainingExamples'. */
312  featuresAreAlreadyNormalized,
313  cancelFlag,
314  log
315  );
316  if (trainer->Abort ())
317  return;
318 
319  duplicateTrainDataCount += trainer->DuplicateDataCount ();
320  trainingTime += trainer->TrainingTime ();
321 
322  kkint32 foldNumSVs = 0;
323  kkint32 foldTotalNumSVs = 0;
324  trainer->SupportVectorStatistics (foldNumSVs, foldTotalNumSVs);
325  numSVs += foldNumSVs;
326  totalNumSVs += foldTotalNumSVs;
327 
328  log.Level (20) << "CrossValidate Creating Classification Object" << endl;
329 
330  Classifier2 classifier (trainer, log);
331  {
332  // Make sure that a Noise class exists
333  mlClasses->GetNoiseClass ();
334  }
335 
336  log.Level (20) << "CrossValidate Classifying Test Images." << endl;
337 
338  double breakTie = 0.0f;
339  FeatureVectorPtr example = NULL;
340  MLClassPtr knownClass = NULL;
341  bool knownClassOneOfTheWinners = false;
342  kkint32 numOfWinners = 0;
343  MLClassPtr predictedClass = NULL;
344  double probability = 0.0f;
345 
346  kkint32 numTestExamples = testImages->QueueSize ();
347 
348  kkint32 foldCorrect = 0;
349  kkint32 foldCount = 0;
350 
351  vector<FeatureVectorPtr> exampleHist (numTestExamples);
352  vector<MLClassPtr> knownClassHist (numTestExamples);
353  vector<bool> knownClassOneOfTheWinnersHist (numTestExamples, false);
354  vector<kkint32> numOfWinersHist (numTestExamples, 0);
355  vector<MLClassPtr> predictedClassHist (numTestExamples);
356  vector<double> probabilityHist (numTestExamples, 0.0f);
357 
358  FeatureVectorList::iterator fvIDX;
359 
360  double startClassificationTime = osGetSystemTimeUsed ();
361 
362  for (fvIDX = testImages->begin (); (fvIDX != testImages->end ()) && (!cancelFlag); fvIDX++)
363  {
364  example = *fvIDX;
365 
366  knownClass = example->MLClass ();
367 
368  predictedClass = classifier.ClassifyAExample (*example,
369  probability,
370  numOfWinners,
371  knownClassOneOfTheWinners,
372  breakTie
373  );
374 
375  exampleHist [foldCount] = example;
376  knownClassHist [foldCount] = knownClass;
377  predictedClassHist [foldCount] = predictedClass;
378  probabilityHist [foldCount] = probability;
379  numOfWinersHist [foldCount] = numOfWinners;
380  knownClassOneOfTheWinnersHist [foldCount] = knownClassOneOfTheWinners;
381 
382  foldCount++;
383  }
384 
385  double endClassificationTime = osGetSystemTimeUsed ();
386  double testTimeThisFold = (endClassificationTime - startClassificationTime);
387  testTime += testTimeThisFold;
388 
389  // lets update statistics
390  foldCount = 0;
391  for (foldCount = 0; (foldCount < numTestExamples) && (!cancelFlag); foldCount++)
392  {
393  example = exampleHist [foldCount];
394  predictedClass = predictedClassHist [foldCount];
395  probability = probabilityHist [foldCount];
396  numOfWinners = numOfWinersHist [foldCount];
397  knownClass = knownClassHist [foldCount];
398  knownClassOneOfTheWinners = knownClassOneOfTheWinnersHist [foldCount];
399 
400 
401  totalPredProb += probability;
402 
403 
404  confusionMatrix->Increment (knownClass,
405  predictedClass,
406  (kkint32)(example->OrigSize ()),
407  probability,
408  log
409  );
410 
411  cmByNumOfConflicts[numOfWinners]->Increment (knownClass,
412  predictedClass,
413  (kkint32)(example->OrigSize ()),
414  probability,
415  log
416  );
417 
418  bool correctClassificationMade = false;
419  numOfWinnersCounts[numOfWinners]++;
420  if (knownClass == predictedClass)
421  {
422  correctClassificationMade = true;
423  numOfWinnersCorrects[numOfWinners]++;
424  foldCorrect++;
425  }
426 
427  if (classedCorrectly)
428  {
429  classedCorrectly[foldCount] = correctClassificationMade;
430  }
431 
432  if (knownClassOneOfTheWinners)
433  numOfWinnersOneOfTheWinners[numOfWinners]++;
434  }
435 
436  float foldAccuracy = 0.0;
437 
438  if (foldCount > 0)
439  foldAccuracy = 100.0f * (float)foldCorrect / (float)foldCount;
440 
441  foldAccuracies.push_back (foldAccuracy);
442  foldCounts.push_back (foldCount);
443 
444  supportPoints.push_back ((float)trainer->NumOfSupportVectors ());
445  trainTimes.push_back (trainer->TrainingTime ());
446  testTimes.push_back (testTimeThisFold);
447 
448  delete trainer;
449  trainer = NULL;
450 
451  log.Level (20) << "CrossValidation::CrossValidate - Done." << endl;
452 } /* CrossValidate */
453 
454 
455 
457 {
458  if (confusionMatrix)
459  return (float)confusionMatrix->Accuracy ();
460  else
461  return 0.0f;
462 } /* Accuracy */
463 
464 
465 
467 {
468  if (confusionMatrix)
469  return (float)confusionMatrix->AccuracyNorm ();
470  else
471  return 0.0f;
472 } /* Accuracy */
473 
474 
475 
476 
478 {
479  KKStr foldAccuracyStr (9 * numOfFolds); // Pre Reserving enough space for all Accuracies.
480 
481  for (kkuint32 foldNum = 0; foldNum < foldAccuracies.size (); foldNum++)
482  {
483  if (foldNum > 0)
484  foldAccuracyStr << "\t";
485  foldAccuracyStr << StrFormatDouble (foldAccuracies[foldNum], "ZZ,ZZ0.00%");
486  }
487 
488  return foldAccuracyStr;
489 } /* FoldAccuracysToStr */
490 
491 
492 
493 
494 
495 float CrossValidation::FoldAccuracy (kkint32 foldNum) const
496 {
497  if ((foldNum < 0) || (foldNum >= (kkint32)foldAccuracies.size ()))
498  {
499  return 0.0f;
500  }
501 
502  return foldAccuracies[foldNum];
503 } /* FoldAccuracy */
504 
505 
506 
507 
509 {
510  weOwnConfusionMatrix = false;
511  return confusionMatrix;
512 }
void SupportVectorStatistics(kkint32 &numSVs, kkint32 &totalNumSVs)
KKStr(kkint32 size)
Creates a KKStr object that pre-allocates space for 'size' characters.
Definition: KKStr.cpp:655
void PushOnBack(FeatureVectorPtr image)
Overloading the PushOnBack function in KKQueue so we can monitor the Version and Sort Order...
ConfusionMatrix2Ptr GiveMeOwnershipOfConfusionMatrix()
__int32 kkint32
Definition: KKBaseTypes.h:88
FeatureVector * FeatureVectorPtr
Definition: Model.h:44
MLClassPtr GetNoiseClass() const
Definition: MLClass.cpp:875
float OrigSize() const
The value of Feature[0] before normalization.
A class that is meant to manage a n-Fold Cross Validation.
void RunCrossValidation(RunLog &log)
virtual FeatureVectorListPtr DuplicateListAndContents() const
Creates a duplicate of list and also duplicates it contents.
virtual FeatureVectorListPtr ManufactureEmptyList(bool _owner) const
Creates an instance of a Empty FeatureVectorList.
unsigned __int32 kkuint32
Definition: KKBaseTypes.h:89
KKTHread * KKTHreadPtr
static TrainingProcess2Ptr CreateTrainingProcessFromTrainingExamples(TrainingConfiguration2Const *config, FeatureVectorListPtr trainingExamples, bool takeOwnershipOfTrainingExamples, bool featuresAlreadyNormalized, VolConstBool &cancelFlag, RunLog &log)
Will Construct an instance using provided list of examples rather than loading from training library...
KKStr FoldAccuracysToStr() const
ConfusionMatrix2(const MLClassList &_classes)
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of 'std::string' strings.
Definition: KKStr.cpp:1082
Classifier2(TrainingProcess2Ptr _trainer, RunLog &_log)
Definition: Classifier2.cpp:42
MLClassPtr MLClass() const
Class that is example is assigned to.
TrainingProcess2 * TrainingProcess2Ptr
Definition: Classifier2.h:62
kkint32 DuplicateDataCount() const
double osGetSystemTimeUsed()
Returns the number of CPU seconds used by current process.
void Increment(MLClassPtr _knownClass, MLClassPtr _predClass, kkint32 _size, double _probability, RunLog &_log)
MLClassPtr ClassifyAExample(FeatureVector &example, double &probability, kkint32 &numOfWinners, bool &knownClassOneOfTheWinners, double &breakTie)
float FoldAccuracy(kkint32 foldNum) const
Used for logging messages.
Definition: RunLog.h:49
void EncodeProblem(const struct svm_paramater &param, struct svm_problem &prob_in, struct svm_problem &prob_out)
CrossValidation(TrainingConfiguration2Ptr _config, FeatureVectorListPtr _examples, MLClassListPtr _mlClasses, kkint32 _numOfFolds, bool _featuresAreAlreadyNormalized, FileDescPtr _fileDesc, RunLog &_log, bool &_cancelFlag)
FeatureVectorListPtr ExtractExamplesForClassList(MLClassListPtr classes)
A confusion matrix object that is used to record the results from a CrossValidation. <see also cref="CrossValidation"
void RunValidationOnly(FeatureVectorListPtr validationData, bool *classedCorrectly, RunLog &log)
FactoryFVProducerPtr FvFactoryProducer(RunLog &log) const