KSquare Utilities
FeatureFileIOC45.cpp
Go to the documentation of this file.
1 #include "FirstIncludes.h"
2 #include <ctype.h>
3 #include <limits.h>
4 #include <math.h>
5 #include <stdio.h>
6 #include <time.h>
7 #include <string.h>
8 #include <string>
9 #include <iostream>
10 #include <fstream>
11 #include <vector>
12 #include "MemoryDebug.h"
13 using namespace std;
14 
15 #include "KKBaseTypes.h"
16 #include "DateTime.h"
17 #include "OSservices.h"
18 #include "RunLog.h"
19 #include "KKStr.h"
20 using namespace KKB;
21 
22 
23 #include "FeatureFileIOC45.h"
24 #include "FileDesc.h"
25 #include "MLClass.h"
26 using namespace KKMLL;
27 
28 
29 
31 
32 
34  FeatureFileIO ("C45", true, true)
35 {
36 }
37 
38 
39 
41 {
42 }
43 
44 
45 
46 FeatureVectorListPtr FeatureFileIOC45::LoadFeatureFile
47  (const KKStr& _fileName,
48  MLClassList& _mlClasses,
49  kkint32 _maxCount,
50  VolConstBool& _cancelFlag, /**< will be monitored, if set to True Load will terminate. */
51  bool& _successful,
52  bool& _changesMade,
53  RunLog& _log
54  )
55 {
56  _log.Level (10) << "FeatureFileIOC45::LoadFeatureFile File[" << _fileName << "] FileFormat[" << DriverName () << "]" << endl;
57 
58  if (_maxCount < 0)
59  _maxCount = int32_max;
60 
61 
62  KKStr namesFileName;
63  KKStr dataFileName;
64 
65  {
66  // First determine the name of the names and data file.
67  kkint32 lastDotPos = _fileName.LocateLastOccurrence ('.');
68  if (lastDotPos < 0)
69  {
70  namesFileName = _fileName + ".names";
71  dataFileName = _fileName;
72  if (!osFileExists (dataFileName))
73  dataFileName = _fileName + ".data";
74  }
75  else
76  {
77  KKStr leadingPart = _fileName.SubStrPart (0, lastDotPos - 1);
78  KKStr extension = _fileName.SubStrPart (lastDotPos + 1);
79  extension.Upper ();
80  if (extension == "NAMES")
81  {
82  // The file name was already the names file, can use as is
83  namesFileName = _fileName;
84  dataFileName = leadingPart;
85  if (!osFileExists (dataFileName))
86  {
87  dataFileName = leadingPart + ".data";
88  if (!osFileExists (dataFileName))
89  dataFileName = leadingPart + ".test";
90  }
91  }
92 
93  else if ((extension == "DATA") || (extension == "TEST"))
94  {
95  dataFileName = _fileName;
96  namesFileName = leadingPart + ".names";
97  if (!osFileExists (namesFileName))
98  {
99  namesFileName = leadingPart + ".names";
100  if (!osFileExists (namesFileName))
101  {
102  namesFileName = _fileName + ".names";
103  if (!osFileExists (namesFileName))
104  namesFileName = leadingPart + ".names";
105  }
106  }
107  }
108 
109  else
110  {
111  dataFileName = _fileName;
112  namesFileName = _fileName + ".names";
113  }
114  }
115  }
116 
117  _changesMade = false;
118 
119  kkint32 estimatedNumOfDataItems = -1;
120 
121  _successful = true;
122 
123  ifstream namesFile (namesFileName.Str (), ios_base::in);
124  if (!namesFile.is_open ())
125  {
126  _log.Level (-1) << "FeatureFileIOC45::LoadFeatureFile ***ERROR*** Error Opening File[" << dataFileName << "]." << endl;
127  _successful = false;
128  return NULL;
129  }
130 
131  KKStr errorMessage;
132 
133  FileDescPtr fileDesc = GetFileDesc (namesFileName, namesFile, &_mlClasses, estimatedNumOfDataItems, errorMessage, _log);
134  if (fileDesc == NULL)
135  {
136  _log.Level (-1) << endl << endl
137  << "FeatureFileIOC45::LoadFeatureFile ***ERROR*** Loading Feature File[" << namesFileName << "]" << endl
138  << endl;
139  _successful = false;
140  return NULL;
141  }
142 
143  namesFile.close ();
144  fileDesc = FileDesc::GetExistingFileDesc (fileDesc);
145 
146 
147 
148  ifstream dataFile (dataFileName.Str (), ios_base::in);
149  if (!dataFile.is_open ())
150  {
151  _log.Level (-1) << "FeatureFileIOC45::LoadFeatureFile ***ERROR*** Error Opening File[" << dataFileName << "]." << endl;
152  _successful = false;
153  return NULL;
154  }
155 
156 
157  FeatureVectorListPtr examples = LoadFile (dataFileName, fileDesc, _mlClasses, dataFile, _maxCount, _cancelFlag, _changesMade, errorMessage, _log);
158  if (examples == NULL)
159  {
160  _successful = false;
161  }
162  else
163  {
164  _successful = true;
165  }
166 
167  dataFile.close ();
168 
169  return examples;
170 } /* LoadFeatureFile */
171 
172 
173 
174 
175 
176 void FeatureFileIOC45::C45StripComments (KKStr& ln)
177 {
178  kkint32 lastBarPos = ln.LocateLastOccurrence ('|');
179  if (lastBarPos >= 0)
180  ln = ln.SubStrPart (0, lastBarPos - 1);
181 
182  ln.TrimLeft (" \n\r\t");
183  ln.TrimRight (" \n\r\t");
184 } /* C45StripComments */
185 
186 
187 
188 
189 // deal with comments, white spaces, and escape characters.
190 void FeatureFileIOC45::C45StrPreProcessName (KKStr& ln)
191 {
192  ln.TrimRight ();
193  ln.TrimLeft ();
194 
195  KKStr newLine (ln.Len ());
196 
197  kkuint32 x = 0;
198  char thisChar;
199  char nextChar;
200  while (x < ln.Len ())
201  {
202  thisChar = ln[x];
203  nextChar = ln[x + 1];
204 
205  if (strchr (" \t", thisChar))
206  {
207  // We have a white space character, will want to compress into a single white space character
208  while ((x < ln.Len ()) && (strchr (" \t", nextChar)))
209  {
210  x++;
211  thisChar = nextChar;
212  nextChar = ln[x + 1];
213  }
214  }
215 
216  else if (thisChar == '\\')
217  {
218  // A possible escape sequence
219  if (strchr (",:?", nextChar))
220  {
221  x++;
222  thisChar = nextChar;
223  nextChar = ln[x + 1];
224  }
225  }
226 
227  newLine.Append (thisChar);
228 
229  x++;
230  }
231 
232  ln = newLine;
233 } /* C45StrPreProcessName */
234 
235 
236 /**
237  *@brief Locates first 'ch' in 'txt' that is not preceded by an escape character('\\').
238  */
239 kkint32 FeatureFileIOC45::C45LocateNextCharacter (const KKStr& txt,
240  char ch
241  )
242 {
243  kkuint32 x = 0;
244  while (x < txt.Len ())
245  {
246  if (txt[x] == ch)
247  {
248  if (x == 0) break;
249  if (txt[x - 1] != '\\') break;
250  }
251  x++;
252  }
253 
254  if (x < txt.Len ()) return (kkint32)x; else return -1;
255 } /* C45LocateNextCharacter */
256 
257 
258 
259 
260 
261 void FeatureFileIOC45::ProcessC45AttrStr (FileDescPtr fileDesc,
262  KKStr& attrStr,
263  bool& validStr,
264  RunLog& _log
265  )
266 {
267  validStr = true;
268 
269  attrStr.TrimLeft ();
270  attrStr.TrimRight ();
271  if (attrStr.Empty ())
272  return;
273 
274  kkint32 colPos = C45LocateNextCharacter (attrStr, ':');
275  if (colPos < 0)
276  {
277  // Missing Attribute Specification
278  _log.Level (-1) << endl
279  << "FeatureFileIOC45::ProcessC45AttrStr ***ERROR*** Missing Attribute Specification (No Colon)." << endl
280  << " attrStr[" << attrStr << "]" << endl
281  << endl;
282  validStr = false;
283  return;
284  }
285 
286  KKStr name = attrStr.SubStrPart (0, colPos - 1);
287  KKStr typeStr = attrStr.SubStrPart (colPos + 1);
288  C45StrPreProcessName (name);
289 
290  if (name.Empty ())
291  {
292  _log.Level (-1) << endl
293  << "FeatureFileIOC45::ProcessC45AttrStr ***ERROR*** Field Name is Empty" << endl
294  << " AttrStr[" << attrStr << "]" << endl
295  << endl;
296  validStr = false;
297  return;
298  }
299 
300  // Lets determine type now
301  typeStr.TrimLeft ();
302  typeStr.TrimRight ();
303  KKStr typeStrUpper = typeStr.ToUpper ();
304 
306 
307  if (typeStrUpper == "CONTINUOUS")
308  {
309  attributeType = AttributeType::Numeric;
310  }
311 
312  else if (typeStrUpper == "IGNORE")
313  {
314  attributeType = AttributeType::Ignore;
315  }
316 
317  else if (typeStrUpper == "SYMBOLIC")
318  {
319  attributeType = AttributeType::Symbolic;
320  }
321 
322  else
323  {
324  // We have a nominal field
325  attributeType = AttributeType::Nominal;
326  }
327 
328  bool alreadyExists = false;
329  fileDesc->AddAAttribute (name, attributeType, alreadyExists);
330  if (alreadyExists)
331  {
332  // Two fields with the same name, *** VERY BAD ***
333  _log.Level (-1) << endl
334  << "FeatureFileIOC45::ProcessC45AttrStr *** ERROR *** Field Name Occurs more than once."
335  << " AttrStr[" << attrStr << "]" << endl
336  << endl;
337  validStr = false;
338  return;
339  }
340 
341 
342  if (attributeType == AttributeType::Nominal)
343  {
344  // Will now parse out the nominal values.
345  while (!typeStr.Empty ())
346  {
347  KKStr nominalValue;
348 
349  kkint32 commaPos = C45LocateNextCharacter (typeStr, ',');
350  if (commaPos < 0)
351  {
352  nominalValue = typeStr;
353  typeStr = "";
354  }
355  else
356  {
357  nominalValue = typeStr.SubStrPart (0, commaPos - 1);
358  typeStr = typeStr.SubStrPart (commaPos + 1);
359  }
360 
361  C45StrPreProcessName (nominalValue);
362 
363  if (nominalValue.Empty ())
364  {
365  // Must have some characters in name.
366  _log.Level (-1) << endl
367  << "FileDesc::AddANominalValue *** ERROR ***" << endl
368  << " Blank NominalValue (\"\")" << endl
369  << " AttrStr [" << attrStr << "]." << endl
370  << endl;
371  validStr = false;
372  return;
373  }
374 
375  fileDesc->AddANominalValue (nominalValue, alreadyExists, _log);
376  if (alreadyExists)
377  {
378  _log.Level (-1) << endl
379  << "FileDesc::AddANominalValue *** ERROR ***" << endl
380  << " Nominal Value [" << nominalValue << "] occurs more than once." << endl
381  << " AttrStr [" << attrStr << "]." << endl
382  << endl;
383  validStr = false;
384  return;
385  }
386  }
387  }
388 } /* ProcessC45AttrStr */
389 
390 
391 
392 
393 
394 
395 
397  istream& _in,
398  MLClassListPtr _classes,
399  kkint32& _estSize,
400  KKStr& _errorMessage,
401  RunLog& _log
402  )
403 {
404  KKStr namesFileName;
405 
406 
407  bool classLineRead = false;
408 
409  KKStr ln (1024);
410  bool eof = false;
411  kkint32 lineNum = 0;
412  GetLine (_in, ln, eof); lineNum++;
413 
414  while ((!eof) && (!classLineRead))
415  {
416  C45StripComments (ln);
417 
418  if (!ln.Empty ())
419  {
420  // We have our very first line, this should consist
421  // of all the class names.
422 
423  // Eliminate any trailing period.
424  if (ln.LastChar () == '.')
425  ln.ChopLastChar ();
426 
427  VectorKKStr classNames = ln.Split ("\n\r\t,");
428  for (kkint32 idx = 0; idx < (kkint32)classNames.size (); idx++)
429  {
430  KKStr className = classNames[idx];
431  C45StrPreProcessName (className);
432  MLClassPtr mlClass = _classes->GetMLClassPtr (className);
433  }
434 
435  classLineRead = true;
436  }
437  else
438  {
439  GetLine (_in, ln, eof); lineNum++;
440  }
441  }
442 
443  if (!classLineRead)
444  {
445  _log.Level (-1) << endl
446  << "FeatureFileIOC45::GetFileDesc *** ERROR *** No class line in Names File." << endl
447  << endl;
448  _errorMessage = "No class line in Names File.";
449  return NULL;
450  }
451 
452 
453  FileDescPtr fileDesc = new FileDesc ();
454  fileDesc->AddClasses (*_classes);
455 
456  // Can now load in attribute data
457  GetLine (_in, ln, eof); lineNum++;
458  while (!eof)
459  {
460  C45StripComments (ln);
461 
462  // "ln" may consist of more than one attribute descriptor separated by periods.
463  while (!ln.Empty ())
464  {
465  bool validStr = true;
466 
467  // Locate a period followed by white space, if not followed by
468  // white space, will be part of a name.
469  kkuint32 dotPos = 0;
470  while (dotPos < ln.Len ())
471  {
472  if (ln[dotPos] == '.')
473  {
474  if (dotPos >= (ln.Len () - 1))
475  break;
476 
477  else if (strchr (" \t\r\n", ln[dotPos + 1]))
478  break;
479  }
480 
481  dotPos++;
482  }
483 
484  KKStr attrStr;
485 
486  if (dotPos >= ln.Len ())
487  {
488  attrStr = ln;
489  ln = "";
490  }
491  else
492  {
493  attrStr = ln.SubStrPart (0, dotPos - 1);
494  ln = ln.SubStrPart (dotPos + 1);
495  }
496 
497  KKStr origAttrStr (attrStr);
498  ProcessC45AttrStr (fileDesc, attrStr, validStr, _log);
499  if (!validStr)
500  {
501  _log.Level (-1) << endl
502  << "FeatureFileIOC45::GetFileDesc ***ERROR*** Invalid AttributeStr[" << origAttrStr << "]." << endl
503  << " LineNum[" << lineNum << "]" << endl
504  << endl;
505  _errorMessage = "No class line in Names File.";
506  _errorMessage << "Invalid AttributeStr[" << origAttrStr << "], LineNum[" << lineNum << "]";
507 
508  // Can not delete an instance of a 'FileDesc' class once it has been created.
509  // delete fileDesc; fileDesc = NULL;
510  return NULL;
511  }
512  }
513 
514  GetLine (_in, ln, eof); lineNum++;
515  }
516 
517  return fileDesc;
518 } /* GetFileDesc */
519 
520 
521 
522 
523 
524 
525 
526 
527 
528 
529 KKStr FeatureFileIOC45::C45ReadNextToken (istream& in,
530  const char* delimiters,
531  bool& eof,
532  bool& eol
533  )
534 {
535  eof = false;
536  eol = false;
537 
538  const kkint32 maxTokenLen = 1024;
539  char token[maxTokenLen];
540 
541  // lets skip leading white space
542  kkint32 ch = in.get (); eof = in.eof ();
543  while ((!eof) && ((ch == ' ') || (ch == '\r') || (ch == '\t')) && (ch != '\n'))
544  {ch = in.get (); eof = in.eof ();}
545 
546  if (ch == '\n')
547  {
548  eol = true;
549  if (in.peek () == '\r')
550  in.get ();
551  return "";
552  }
553 
554  if (ch == '\r')
555  {
556  eol = true;
557  if (in.peek () == '\n')
558  in.get ();
559  return "";
560  }
561 
562  else if (ch == '.')
563  {
564  // If next character is a white space, eol, eof, of '|'
565  // then treat as end of entry or '\n'.
566  char nextCh = in.peek ();
567  if (strchr (" \t\r\n|", nextCh))
568  {
569  eol = true;
570  return "";
571  }
572  }
573 
574 
575  else if (ch == '|')
576  {
577  // The rest of the line is meant to be a comment,
578  // we can skip all the following characters.
579  while ((!eof) && (ch != '\n') && (ch != '\r'))
580  {ch = in.get (); eof = in.eof ();}
581  eol = true;
582 
583  if (!eof)
584  {
585  if ((ch == '\n') && (in.peek () == '\r'))
586  in.get (); // I want to leave ch with '\n' in it.
587 
588  else if ((ch == '\r') && (in.peek () == '\n'))
589  ch = in.get ();
590  }
591  eof = in.eof ();
592  }
593 
594 
595  kkint32 tokenLen = 0;
596 
597  // Read till first delimiter or eof
598  while ((!eof) && (!strchr (delimiters, ch)))
599  {
600  if ((ch == '\n') || (ch == '|'))
601  {
602  in.putback (ch);
603  break;
604  }
605 
606  else if (ch == '.')
607  {
608  // Dots have special meaning when at the end of the line or followed
609  // by a white space character. In these cases they delimit a separated entry.
610  char nextCh = in.get (); bool nextEOF = in.eof ();
611  if (nextEOF)
612  {
613  // Since end of file will treat as end of line
614  in.putback (nextCh);
615  nextCh = in.get (); nextEOF = in.eof ();
616  ch = ' ';
617  break;
618  }
619  else
620  {
621  if (strchr (" \r\t", nextCh))
622  {
623  // Since next character is end of line, or white space them this period is
624  // marking end of entry, will put back into stream so that next call to
625  // C45ReadNextToken will get it next and treat as end of line.
626  in.putback ('.');
627  break;
628  }
629  else if ((nextCh == '\n') || (nextCh == '|'))
630  {
631  // This is a period that is at end of line, in this case
632  // we discard '.' and end token.
633  in.putback (nextCh);
634  break;
635  }
636  else
637  {
638  // This is a valid period, dot '.' character., so lets put the following
639  // character back in the stream
640  in.putback (nextCh);
641  }
642  }
643  }
644 
645  else
646  {
647  if (ch == '\\')
648  {
649  // We may have an escape character that c45 Allows
650  // http://www.cs.washington.edu/dm/vfml/appendixes/c45.htm
651 
652  char nextCh = in.get (); bool nextEOF = in.eof ();
653  if (nextEOF)
654  {
655  // Not Sure what will happen in this case, i don't want to set eof yet,
656  // but do on the next call, so will put character back on line and re-read
657  // this way the next call to this function will get a eof.
658  in.putback (nextCh);
659  nextCh = in.get (); nextEOF = in.eof ();
660  }
661  else
662  {
663  if (strchr (",?:", nextCh))
664  {
665  ch = nextCh;
666  }
667  else
668  {
669  // not a Escape Sequence, put following character back so we get it next time.
670  in.putback (nextCh);
671  }
672  }
673  }
674 
675  else if (strchr (" \t\r", ch))
676  {
677  // We will compress the white space characters to just one blank.
678  ch = ' ';
679 
680  char nextCh = in.get (); bool nextEOF = in.eof ();
681  while ((!nextEOF) && (strchr (" \t\r", nextCh)))
682  {nextCh = in.get (); nextEOF = in.eof ();}
683 
684  if (nextEOF)
685  {
686  // Not Sure what will happen in this case, i don't want to set eof yet,
687  // but do on the next call, so will put character back on line and reread
688  // this way the next call to this function will get a eof.
689  in.putback (nextCh);
690  nextCh = in.get (); nextEOF = in.eof ();
691  }
692  else
693  {
694  // This should be a valid characte, so will put back into file
695  // to read on next loop
696  in.putback (nextCh);
697  }
698  }
699  }
700 
701  token[tokenLen] = ch;
702  tokenLen++;
703  ch = in.get (); eof = in.eof ();
704  }
705 
706  token[tokenLen] = 0; // Terminating NULL character.
707 
708 
709  // Remove Training whitespace
710  while (tokenLen > 0)
711  {
712  if (strchr (" \r\t", token[tokenLen - 1]) == 0)
713  break;
714  tokenLen--;
715  token[tokenLen] = 0;
716  }
717 
718 
719  return token;
720 } /* C45ReadNextToken */
721 
722 
723 
724 
725 FeatureVectorListPtr FeatureFileIOC45::LoadFile (const KKStr& _fileName,
726  const FileDescPtr _fileDesc,
727  MLClassList& _classes,
728  istream& _in,
729  kkint32 _maxCount, // Maximum # images to load.
730  VolConstBool& _cancelFlag,
731  bool& _changesMade,
732  KKStr& _errorMessage,
733  RunLog& _log
734  )
735 {
736  _log.Level (10) << "FeatureFileIOC45::LoadFile FileName[" << _fileName << "]" << endl;
737 
738  bool eof = false;
739  bool eol = false;
740 
741  kkint32 numOfFeatures = _fileDesc->NumOfFields ();
742 
743  KKStr fileRootName = osGetRootName (_fileName);
744 
745  kkint32 lineCount = 0;
746 
747  const
748  AttributePtr* attributeTable = _fileDesc->CreateAAttributeTable (); // Caller will be responsible for deleting
749 
750  bool lineIsValid = true;
751 
752  KKStr imageFileName = "";
753 
754  FeatureVectorListPtr examples = new FeatureVectorList (_fileDesc, true);
755 
756  while (!eof)
757  {
758  lineIsValid = true;
759  imageFileName = "";
760  KKStr field = C45ReadNextToken (_in, ",", eof, eol);
761  if (eof)
762  break;
763 
764  if (eol)
765  {
766  // We have a blank line
767  continue;
768  }
769 
770  FeatureVectorPtr example = new FeatureVector (numOfFeatures);
771  kkint32 fieldNum = 0;
772 
773  // Process all fields for this row 'numOfFeatures'
774 
775  for (fieldNum = 0; fieldNum < numOfFeatures; fieldNum++)
776  {
777  if (eol || eof)
778  {
779  _errorMessage << "Not all Features were accounted for on Line[" << lineCount << "].";
780  _log.Level (-1) << endl << endl
781  << "FeatureFileIOC45::LoadFile " << _errorMessage << endl
782  << endl;
783  delete examples; examples = NULL;
784  delete example; example = NULL;
785  return NULL;
786  }
787 
788  switch (attributeTable[fieldNum]->Type ())
789  {
791  example->AddFeatureData (fieldNum, field.ToFloat ());
792  break;
793 
795  example->AddFeatureData (fieldNum, field.ToFloat ());
796  break;
797 
799  {
800  kkint32 code = -1; // Initialize to value for missing data.
801  if (field == "?")
802  {
803  // Will flag this entry as having missing data.
804  example->MissingData (true);
805  }
806  else
807  {
808  // This is not a missing data.
809  code = attributeTable[fieldNum]->GetNominalCode (field);
810  if (code < 0)
811  {
812  _errorMessage << "Invalid NominalValue[" << field << "] on line[" << lineCount << "].";
813  _log.Level (-1) << endl << endl
814  << "FeatureFileIOC45::LoadFile " << _errorMessage << endl
815  << endl;
816  delete examples; examples = NULL;
817  delete example; example = NULL;
818  return NULL;
819  }
820  }
821 
822  example->AddFeatureData (fieldNum, (float)code);
823  break;
824  }
825 
826 
828  {
829  kkint32 code = -1; // Initialize to value for missing data.
830  if (field == "?")
831  {
832  // Will flag this entry as having missing data.
833  example->MissingData (true);
834  }
835  else
836  {
837  // This is not a missing data.
838 
839  if (attributeTable[fieldNum]->Name ().EqualIgnoreCase ("ExampleFileName"))
840  imageFileName = field;
841 
842  code = attributeTable[fieldNum]->GetNominalCode (field);
843  if (code < 0)
844  {
845  bool alreadyExists = false;
846  attributeTable[fieldNum]->AddANominalValue (field, alreadyExists);
847  code = attributeTable[fieldNum]->GetNominalCode (field);
848  }
849  }
850 
851  example->AddFeatureData (fieldNum, (float)code);
852  break;
853  }
854 
855  default:
856  _log.Level (-1) << endl << endl
857  << "FeatureFileIOC45::LoadFile *** Undefined Field Type ***" << endl
858  << endl;
859  break;
860 
861  } /* End of switch */
862 
863  field = C45ReadNextToken (_in, " ,", eof, eol);
864  }
865 
866  // 'field' should have the class name in it
867  if ((field.Empty ()) || eol)
868  {
869  _errorMessage << "Line[" << lineCount << "] Missing ClassName.";
870  _log.Level (-1) << endl << endl
871  << "FeatureFileIOC45::LoadFile " << _errorMessage << endl
872  << endl;
873 
874  delete examples; examples = NULL;
875  delete example; example = NULL;
876  return NULL;
877  }
878 
879  MLClassPtr mlClass = NULL;
880 
881  if (field == "?")
882  {
883  // The class is unknown
884  mlClass = _fileDesc->LookUpUnKnownMLClass ();
885  }
886  else
887  {
888  mlClass = _fileDesc->LookUpMLClassByName (field);
889  if (!mlClass)
890  {
891  lineIsValid = false;
892  _errorMessage << "Line[" << lineCount << "] Invalid Class[" << field << "]";
893  _log.Level (-1) << endl << endl
894  << "FeatureFileIOC45::LoadFile " << _errorMessage << endl
895  << endl;
896  delete examples; examples = NULL;
897  delete example; example = NULL;
898  return NULL;
899  }
900  }
901 
902  example->MLClass (mlClass);
903 
904  if (imageFileName.Empty ())
905  imageFileName = fileRootName + "_" + StrFormatInt (lineCount, "ZZZZZ0");
906 
907  example->ExampleFileName (imageFileName);
908 
909 
910  if (lineIsValid)
911  examples->PushOnBack (example);
912 
913  lineCount++;
914 
915  if (!eof)
916  {
917  // Consume the rest of the characters in the line so that the next pass starts at
918  // the beginning of the next line.
919  kkint32 ch = _in.peek ();
920  while ((ch != '\n') && (ch != '\r') && (!_in.eof ()))
921  {
922  _in.get ();
923  ch = _in.peek ();
924  }
925  if (!_in.eof ())
926  {
927  _in.get ();
928  if ((ch == '\n') && (_in.peek () == '\r'))
929  _in.get ();
930 
931  else if ((ch == '\r') && (_in.peek () == '\n'))
932  _in.get ();
933  }
934  }
935 
936  if ((lineCount % 1000) == 0)
937  cout << "Records Loaded " << lineCount << endl;
938 
939  if ((kkint32)examples->size () > _maxCount)
940  break;
941  }
942 
943 
944  delete [] attributeTable;
945  return examples;
946 } /* LoadFile */
947 
948 
949 
950 
951 void FeatureFileIOC45::C45ConstructFileNameForWritting (const KKStr& fileName,
952  KKStr& namesFileName,
953  KKStr& dataFileName
954  )
955 {
956  KKStr c45Name;
957 
958  kkint32 lastDotPos = fileName.LocateLastOccurrence ('.');
959 
960  if (lastDotPos < 0)
961  {
962  // First try file name with ".data" extension, then with no extension
963  namesFileName = fileName + ".names";
964  dataFileName = fileName + ".data";
965  }
966  else
967  {
968  KKStr leedingPart = fileName.SubStrPart (0, lastDotPos);
969  KKStr extension = fileName.SubStrPart (lastDotPos + 1);
970  extension.Upper ();
971  if ((extension == "NAMES") || (extension == "NAME"))
972  {
973  namesFileName = fileName;
974  dataFileName = leedingPart + "data";
975  }
976 
977  else if ((extension == "DATA") || (extension == "TEST"))
978  {
979  // Since there is an extension and it is not the 'names' extension,
980  // will just assume the user knows what they are doing.
981  namesFileName = leedingPart + "names";
982  dataFileName = fileName;
983  }
984 
985  else
986  {
987  namesFileName = fileName + ".names";
988  dataFileName = fileName;
989  }
990  }
991 
992  return;
993 } /* C45ConstructFileNameForWritting */
994 
995 
996 
997 KKStr FeatureFileIOC45::C45AdjName (const KKStr& oldName)
998 {
999  kkuint32 x;
1000  KKStr newName (oldName.Len () + 3);
1001  for (x = 0; x < oldName.Len (); x++)
1002  {
1003  char ch = oldName[x];
1004 
1005  if (strchr (",:?", ch))
1006  {
1007  newName.Append ('\\');
1008  }
1009  newName.Append (ch);
1010  }
1011 
1012  return newName;
1013 } /* C45AdjName */
1014 
1015 
1016 
1017 
1018 
1019 
1021  const KKStr& _fileName,
1022  FeatureNumListConst& _selFeatures,
1023  ostream& _out,
1024  kkuint32& _numExamplesWritten,
1025  VolConstBool& _cancelFlag,
1026  bool& _successful,
1027  KKStr& _errorMessage,
1028  RunLog& _log
1029  )
1030 {
1031  KKStr namesFileName;
1032  KKStr dataFileName;
1033 
1034  _numExamplesWritten = 0;
1035 
1036  C45ConstructFileNameForWritting (_fileName, namesFileName, dataFileName);
1037 
1038  FileDescPtr fileDesc = _data.FileDesc ();
1039 
1040  const AttributePtr* attrTable = fileDesc->CreateAAttributeTable ();
1041 
1042  kkint32 x;
1043  {
1044  // Write out names file
1045  ofstream nf (namesFileName.Str ());
1046  MLClassListPtr classes = _data.ExtractListOfClasses ();
1047  for (x = 0; x < classes->QueueSize (); x++)
1048  {
1049  if (x > 0)
1050  nf << ", ";
1051  nf << C45AdjName (classes->IdxToPtr (x)->Name ());
1052  }
1053  nf << "." << endl;
1054 
1055  for (x = 0; x < _selFeatures.NumOfFeatures (); x++)
1056  {
1057  kkint32 featureNum = _selFeatures[x];
1058  AttributePtr attr = attrTable[featureNum];
1059  nf << C45AdjName (attr->Name ()) << ": ";
1060  if (attr->Type () == AttributeType::Nominal)
1061  {
1062  kkint32 y;
1063  for (y = 0; y < attr->Cardinality (); y++)
1064  {
1065  if (y > 0) nf << ", ";
1066  nf << C45AdjName (attr->GetNominalValue (y));
1067  }
1068  }
1069 
1070  else if (attr->Type () == AttributeType::Symbolic)
1071  {
1072  nf << "Symbolic";
1073  }
1074 
1075  else if (attr->Type () == AttributeType::Ignore)
1076  {
1077  nf << "ignore";
1078  }
1079 
1080  else
1081  {
1082  nf << "continuous";
1083  }
1084 
1085  nf << "." << endl;
1086  }
1087 
1088  nf << "ExampleFileName" << ": " << "Symbolic" << "." << endl;
1089 
1090  nf.close ();
1091  }
1092 
1093  {
1094  // Write out class statistics as comments at top of file.
1096  ClassStatisticList::iterator idx;
1097 
1098  _out << "| FileName [" << _fileName << "]" << endl;
1099  _out << "| DateWritten [" << osGetLocalDateTime () << "]" << endl;
1100  _out << "| SelectedFeatures [" << _selFeatures.ToString () << "]" << endl;
1101  _out << "| TotalRecords [" << _data.QueueSize () << "]" << endl;
1102  _out << "| NumAttributes [" << _selFeatures.NumOfFeatures () << "]" << endl;
1103  _out << "|" << endl;
1104  _out << "| Class Statistics" << endl;
1105  _out << "| Name" << "\t" << "Count" << endl;
1106 
1107  for (idx = stats->begin (); idx != stats->end (); idx++)
1108  {
1109  ClassStatisticPtr stat = *idx;
1110  _out << "| " << stat->Name () << "\t" << stat->Count () << endl;
1111  }
1112  _out << "|" << endl;
1113 
1114  delete stats;
1115  }
1116 
1117  kkint32 origPrecision = (kkint32)_out.precision ();
1118  _out.precision (9);
1119 
1120  FeatureVectorPtr example = NULL;
1121 
1122  kkint32 idx;
1123  for (idx = 0; (idx < _data.QueueSize ()) && (!_cancelFlag); idx++)
1124  {
1125  example = _data.IdxToPtr (idx);
1126 
1127  for (x = 0; x < _selFeatures.NumOfFeatures (); x++)
1128  {
1129  kkint32 featureNum = _selFeatures[x];
1130 
1131  if ((attrTable[featureNum]->Type () == AttributeType::Nominal) ||
1132  (attrTable[featureNum]->Type () == AttributeType::Symbolic)
1133  )
1134  {
1135  if (example->FeatureData (featureNum) == -1.0)
1136  {
1137  // Missing Data
1138  _out << "?";
1139  }
1140  else
1141  {
1142  _out << C45AdjName (attrTable[featureNum]->GetNominalValue ((kkint32)(example->FeatureData (featureNum))));
1143  }
1144  }
1145  else
1146  {
1147  _out << example->FeatureData (featureNum);
1148  }
1149  _out << ",";
1150  }
1151  _out << example->ExampleFileName () << ",";
1152  _out << example->ClassName ();
1153  _out << endl;
1154  _numExamplesWritten++;
1155  }
1156 
1157  _out.precision (origPrecision);
1158 
1159  if (!_cancelFlag)
1160  _successful = true;
1161 
1162  delete attrTable;
1163  return;
1164 } /* SaveFile */
KKStr(kkint32 size)
Creates a KKStr object that pre-allocates space for 'size' characters.
Definition: KKStr.cpp:655
void ExampleFileName(const KKStr &_exampleFileName)
Name of source of feature vector, ex: file name of image that the feature vector was computed from...
Definition: FeatureVector.h:75
void PushOnBack(FeatureVectorPtr image)
Overloading the PushOnBack function in KKQueue so we can monitor the Version and Sort Order...
MLClass * MLClassPtr
Definition: MLClass.h:46
VectorKKStr Split(const char *delStr="\n\r\t, ") const
Breaks up the contents of the string into tokens where the characters in &#39;delStr&#39; acts as separates e...
Definition: KKStr.cpp:3480
void AddANominalValue(const KKStr &nominalValue, bool &alreadyExists)
Adds a allowable Nominal value to the Nominal or Symbolic field that this attribute represents...
Definition: Attribute.cpp:123
Provides a detailed description of the attributes of a dataset.
Definition: FileDesc.h:72
static FileDescPtr GetExistingFileDesc(FileDescPtr fileDesc)
Returns a pointer to an existing instance of &#39;fileDesc&#39; if it exists, otherwise will use one being pa...
Definition: FileDesc.cpp:555
bool EqualIgnoreCase(const char *s2) const
Definition: KKStr.cpp:1257
__int32 kkint32
Definition: KKBaseTypes.h:88
const KKStr & GetNominalValue(kkint32 code) const
Returns the nominal value for the given ordinal value.
Definition: Attribute.cpp:143
kkuint32 NumOfFields() const
Definition: FileDesc.h:197
void AddAAttribute(const KKB::KKStr &_name, KKMLL::AttributeType _type, bool &alreadyExists)
Definition: FileDesc.cpp:169
float FeatureData(kkint32 featureNum) const
KKStr & TrimRight(const char *whiteSpaceChars="\n\r\t ")
Definition: KKStr.cpp:1695
FeatureVector(kkint32 _numOfFeatures)
void ChopLastChar()
Definition: KKStr.cpp:1668
virtual FeatureVectorListPtr LoadFeatureFile(const KKStr &_fileName, MLClassList &_mlClasses, kkint32 _maxCount, VolConstBool &_cancelFlag, bool &_successful, bool &_changesMade, RunLog &_log)
Loads the contents of a feature data file and returns a ImageFeaturesList container object...
virtual FileDescPtr GetFileDesc(const KKStr &_fileName, istream &_in, MLClassListPtr _classList, kkint32 &_estSize, KKStr &_errorMessage, RunLog &log)
MLClassListPtr ExtractListOfClasses() const
kkint32 GetNominalCode(const KKStr &nominalValue) const
Definition: Attribute.cpp:185
Supports the reading and writing of feature data from C45 formatted feature files. ...
KKStr & operator=(const char *src)
Definition: KKStr.cpp:1442
MLClassPtr LookUpUnKnownMLClass()
Definition: FileDesc.cpp:291
const FileDescPtr FileDesc() const
FeatureNumList const FeatureNumListConst
bool operator==(const char *rtStr) const
Definition: KKStr.cpp:1588
KKStr ToUpper() const
Definition: KKStr.cpp:2517
ClassStatistic * ClassStatisticPtr
virtual void SaveFile(FeatureVectorList &_data, const KKStr &_fileName, FeatureNumListConst &_selFeatures, ostream &_out, kkuint32 &_numExamplesWritten, VolConstBool &_cancelFlag, bool &_successful, KKStr &_errorMessage, RunLog &_log)
KKStr operator+(const char *right) const
Definition: KKStr.cpp:3986
void AddClasses(const MLClassList &classesToAdd)
Definition: FileDesc.cpp:196
void GetLine(std::istream &_in, KKStr &_line, bool &_eof)
unsigned __int32 kkuint32
Definition: KKBaseTypes.h:89
void AddFeatureData(kkint32 _featureNum, float _featureData)
MLClassPtr LookUpMLClassByName(const KKStr &className)
Definition: FileDesc.cpp:284
char operator[](kkuint32 i) const
Definition: KKStr.cpp:3430
FeatureVectorList(FileDescPtr _fileDesc, bool _owner)
Will create a new empty list of FeatureVectors.
KKStr & operator=(KKStr &&src)
Definition: KKStr.cpp:1369
Container class for FeatureVector derived objects.
Attribute * AttributePtr
Definition: Attribute.h:156
char LastChar() const
Definition: KKStr.cpp:2007
kkuint32 Len() const
Returns the number of characters in the string.
Definition: KKStr.h:366
KKTHread * KKTHreadPtr
void Append(char ch)
Definition: KKStr.cpp:1863
kkuint16 operator[](kkint32 idx) const
Returns back the selected feature.
KKStr(const KKStr &str)
Copy Constructor.
Definition: KKStr.cpp:561
kkint32 NumOfFeatures() const
void TrimLeft(const char *whiteSpaceChars="\n\r\t ")
Definition: KKStr.cpp:1745
Base class for all FeatureFileIO classes.
Definition: FeatureFileIO.h:48
bool Empty() const
Definition: KKStr.h:241
KKStr SubStrPart(kkint32 firstChar, kkint32 lastChar) const
returns a SubString consisting of all characters starting at index &#39;firstChar&#39; and ending at &#39;lastInd...
Definition: KKStr.cpp:2802
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of &#39;std::string&#39; strings.
Definition: KKStr.cpp:1082
ClassStatisticListPtr GetClassStatistics() const
Returns the number of FeatureVectors per class.
void Upper()
Converts all characters in string to their Upper case equivalents via &#39;toupper&#39;.
Definition: KKStr.cpp:2461
kkint32 LocateLastOccurrence(char ch) const
Returns index of last occurrence of &#39;ch&#39; otherwise -1.
Definition: KKStr.cpp:2118
AttributeType
Definition: Attribute.h:36
void AddANominalValue(const KKStr &nominalValue, bool &alreadyExist, RunLog &log)
Definition: FileDesc.cpp:242
void MLClass(MLClassPtr _mlClass)
Assign a class to this example.
Definition: FeatureVector.h:74
FileDesc * FileDescPtr
KKStr StrFormatInt(kkint32 val, const char *mask)
Definition: KKStr.cpp:5004
ClassStatisticList * ClassStatisticListPtr
AttributeType Type() const
Definition: Attribute.h:133
std::ostream &__cdecl operator<<(std::ostream &os, const KKStr &str)
KKStr operator+(const KKStr &right) const
Definition: KKStr.cpp:3998
const KKMLL::AttributePtr * CreateAAttributeTable() const
Definition: FileDesc.cpp:408
bool osFileExists(const KKStr &_fileName)
Definition: OSservices.cpp:568
KKStr & operator=(const KKStr &src)
Definition: KKStr.cpp:1390
Used for logging messages.
Definition: RunLog.h:49
void EncodeProblem(const struct svm_paramater &param, struct svm_problem &prob_in, struct svm_problem &prob_out)
MLClassList * MLClassListPtr
Definition: MLClass.h:49
virtual MLClassPtr GetMLClassPtr(const KKStr &_name)
return pointer to instance with &#39;_name&#39;; if none exists, create one and add to list.
Definition: MLClass.cpp:861
const KKStr & Name() const
Definition: Attribute.h:122
const KKStr & ClassName() const
Name of class that this example is assigned to.
float ToFloat() const
Definition: KKStr.cpp:3553
Maintains a list of MLClass instances.
Definition: MLClass.h:233
kkint32 Cardinality() const
Returns back the cardinality of the attribute; the number of possible values it can take...
Definition: Attribute.cpp:173
Represents a Feature Vector of a single example, labeled or unlabeled.
Definition: FeatureVector.h:59
virtual FeatureVectorListPtr LoadFile(const KKStr &_fileName, const FileDescPtr _fileDesc, MLClassList &_classes, istream &_in, kkint32 _maxCount, VolConstBool &_cancelFlag, bool &_changesMade, KKStr &_errorMessage, RunLog &_log)
#define int32_max
Definition: KKBaseTypes.h:119
KKStr SubStrPart(kkint32 firstChar) const
returns a SubString consisting of all characters starting at index &#39;firstChar&#39; until the end of the s...
Definition: KKStr.cpp:2780
void MissingData(bool _missingData)
True indicates that not all the feature data was present when this example was loaded from a data fil...
Definition: FeatureVector.h:76
FeatureFileIO(const KKStr &_driverName, bool _canRead, bool _canWrite)
KKStr osGetRootName(const KKStr &fullFileName)
volatile const bool VolConstBool
Definition: KKBaseTypes.h:163