KSquare Utilities
FeatureEncoder2.cpp
Go to the documentation of this file.
1 #include "FirstIncludes.h"
2 #include <stdio.h>
3 #include <string>
4 #include <iostream>
5 #include <fstream>
6 #include <math.h>
7 #include <vector>
8 #include <sstream>
9 #include <iomanip>
10 #include "MemoryDebug.h"
11 using namespace std;
12 
13 
14 #include "KKBaseTypes.h"
15 #include "OSservices.h"
16 #include "RunLog.h"
17 using namespace KKB;
18 
19 
20 #include "FeatureEncoder2.h"
21 #include "BinaryClassParms.h"
22 #include "FeatureNumList.h"
23 #include "FeatureVector.h"
24 #include "SvmWrapper.h"
25 using namespace KKMLL;
26 
27 /**
28  *@brief Constructs a Feature Encoder object.
29  *@param[in] _param Model parameters; the selected feature list and the encoding method are read from it.
30  *@param[in] _fileDesc Description of the source data; attribute types, cardinalities and attribute names are read from it.
 *@details For every selected feature the constructor records the source feature number, the first
 * destination (encoded) feature index, the destination cardinality and the action to take when
 * encoding. The destination field names gathered along the way are used to build 'encodedFileDesc'.
32  */
// NOTE(review): the line that opens this constructor's signature is not present in this listing;
// only the second parameter is visible below.
34  FileDescPtr _fileDesc
35  ):
36  attributeVector (_fileDesc->AttributeVector ()),
37  cardinalityDest (NULL),
38  cardinalityVector (_fileDesc->CardinalityVector ()),
39  codedNumOfFeatures (0),
40  destFeatureNums (NULL),
41  destWhatToDo (NULL),
42  encodedFileDesc (NULL),
// NOTE(review): no initializer for 'encodingMethod' is visible here; it is assigned in the body below.
44  fileDesc (_fileDesc),
45  numOfFeatures (0),
46  srcFeatureNums (NULL),
47  param (_param)
48 
49 {
50  FeatureNumListConstPtr selectedFeatures = param.SelectedFeatures ();
51  numOfFeatures = param.SelectedFeatures ()->NumOfFeatures ();
52 
53  encodingMethod = param.EncodingMethod ();
54 
// Four parallel arrays, one entry per selected feature.
55  srcFeatureNums = new kkuint16 [numOfFeatures];
56  cardinalityDest = new kkint32 [numOfFeatures];
57  destFeatureNums = new kkint32 [numOfFeatures];
58  destWhatToDo = new FeWhatToDo[numOfFeatures];
59 
// Names of the encoded fields, in destination order.
60  VectorKKStr destFieldNames;
61 
62  kkint32 x;
63 
64  for (x = 0; x < numOfFeatures; x++)
65  {
66  kkuint16 srcFeatureNum = (*selectedFeatures)[x];
67  srcFeatureNums [x] = srcFeatureNum;
// The running total of encoded features is the first destination index used by this feature.
68  destFeatureNums [x] = codedNumOfFeatures;
// Defaults: one destination column, value copied unchanged.
69  cardinalityDest [x] = 1;
70  destWhatToDo [x] = FeWhatToDo::FeAsIs;
71 
72  Attribute srcAttribute = (fileDesc->Attributes ())[srcFeatureNum];
73 
74  switch (encodingMethod)
75  {
76  case ModelParam::EncodingMethodType::Binary:
77  if ((attributeVector[srcFeatureNum] == AttributeType::Nominal) ||
78  (attributeVector[srcFeatureNum] == AttributeType::Symbolic)
79  )
80  {
// Nominal/Symbolic feature: one destination column per nominal value,
// each named "<attribute name>_<nominal value>".
81  destWhatToDo [x] = FeWhatToDo::FeBinary;
82  cardinalityDest [x] = cardinalityVector[srcFeatureNums [x]];
83  codedNumOfFeatures += cardinalityDest[x];
84  for (kkint32 zed = 0; zed < cardinalityDest[x]; zed++)
85  {
86  KKStr fieldName = srcAttribute.Name () + "_" + srcAttribute.GetNominalValue (zed);
87  destFieldNames.push_back (fieldName);
88  }
89  }
90  else
91  {
// Any other attribute type keeps a single column under its own name.
92  codedNumOfFeatures++;
93  destWhatToDo [x] = FeWhatToDo::FeAsIs;
94  destFieldNames.push_back (srcAttribute.Name ());
95  }
96  break;
97 
98 
// NOTE(review): a 'case' label appears to be missing from this listing at this point; the
// statements below mark Nominal/Symbolic features as FeScale — confirm the label against the real file.
100  codedNumOfFeatures++;
101  if ((attributeVector[srcFeatureNums[x]] == AttributeType::Nominal) ||
102  (attributeVector[srcFeatureNums[x]] == AttributeType::Symbolic)
103  )
// NOTE(review): cardinalityDest[x] is still 1 on this path, while EncodeAExample divides FeScale
// values by cardinalityDest[x] — confirm that leaving it at 1 is intended.
104  destWhatToDo [x] = FeWhatToDo::FeScale;
105  else
106  destWhatToDo [x] = FeWhatToDo::FeAsIs;
107 
108  destFieldNames.push_back (srcAttribute.Name ());
109  break;
110 
111 
// NOTE(review): the listing skips a line just before 'default:'; a further 'case' label may belong here.
113  default:
// Every remaining encoding method: single column, value copied unchanged.
114  codedNumOfFeatures++;
115  destWhatToDo [x] = FeWhatToDo::FeAsIs;
116  destFieldNames.push_back (srcAttribute.Name ());
117  break;
118  }
119  }
120 
// Build the description of the encoded data from the destination field names.
121  encodedFileDesc = FileDesc::NewContinuousDataOnly (destFieldNames);
122 }
123 
124 
125 
126 
128  attributeVector (_encoder.attributeVector),
129  cardinalityDest (NULL),
130  cardinalityVector (_encoder.cardinalityVector),
131  codedNumOfFeatures (_encoder.codedNumOfFeatures),
132  destFeatureNums (NULL),
133  destWhatToDo (NULL),
134  encodedFileDesc (_encoder.encodedFileDesc),
135  encodingMethod (_encoder.encodingMethod),
136  fileDesc (_encoder.fileDesc),
137  numOfFeatures (_encoder.numOfFeatures),
138  srcFeatureNums (NULL),
139  param (_encoder.param)
140 {
141  cardinalityDest = new kkint32[numOfFeatures];
142  destFeatureNums = new kkint32[numOfFeatures];
143  destWhatToDo = new FeWhatToDo[numOfFeatures];
144  srcFeatureNums = new kkuint16[numOfFeatures];
145 
146  kkint32 x;
147  for (x = 0; x < numOfFeatures; x++)
148  {
149  srcFeatureNums [x] = _encoder.srcFeatureNums [x];
150  destFeatureNums [x] = _encoder.destFeatureNums[x];
151  cardinalityDest [x] = _encoder.cardinalityDest[x];
152  destWhatToDo [x] = _encoder.destWhatToDo [x];
153  srcFeatureNums [x] = _encoder.srcFeatureNums [x];
154  }
155 }
156 
157 
158 
159 /**
160  * @brief Frees any memory allocated by, and owned by the FeatureEncoder2
161  */
163 {
164  delete srcFeatureNums;
165  delete destFeatureNums;
166  delete cardinalityDest;
167  delete destWhatToDo;
168 }
169 
170 
172 {
173  kkint32 memoryConsumedEstimated = sizeof (FeatureEncoder2)
174  + attributeVector.size () * sizeof (AttributeType)
175  + cardinalityVector.size () * sizeof (kkint32);
176 
177  if (cardinalityDest) memoryConsumedEstimated += 2 * numOfFeatures * sizeof (kkint32); // For 'cardinalityDest', 'destFeatureNums'
178  if (destFeatureNums) memoryConsumedEstimated += numOfFeatures * sizeof (kkint32);
179  if (destWhatToDo) memoryConsumedEstimated += numOfFeatures * sizeof (FeWhatToDo);
180  if (srcFeatureNums) memoryConsumedEstimated += numOfFeatures * sizeof (kkuint16);
181 
182  return memoryConsumedEstimated;
183 }
184 
185 
186 
188 {
189  return encodedFileDesc->NumOfFields ();
190 }
191 
192 
193 
194 
196  RunLog& log
197  ) const
198 {
199  log.Level (40) << "FeatureEncoder2::CreateEncodedFileDesc" << endl;
200  FileDescPtr newFileDesc = new FileDesc ();
201 
202  if (o)
203  {
204  *o << endl
205  << "Orig" << "\t" << "Orig" << "\t" << "Field" << "\t" << "Encoded" << "\t" << "Encoded" << endl;
206  *o << "FieldNum" << "\t" << "FieldName" << "\t" << "Type" << "\t" << "FieldNum" << "\t" << "FieldName" << endl;
207  }
208 
209  kkint32 x;
210 
211  bool alreadyExist;
212 
213  for (x = 0; x < numOfFeatures; x++)
214  {
215  kkuint16 srcFeatureNum = srcFeatureNums[x];
216  kkint32 y = destFeatureNums[x];
217 
218  if (y >= codedNumOfFeatures)
219  {
220  log.Level(-1)
221  << endl
222  << "FeatureEncoder2::CreateEncodedFileDesc ***ERROR***" << endl
223  << " overriding number of encoded features. This should never be able to happen." << endl
224  << " Something is wrong with object." << endl
225  << endl;
227  exit (-1);
228  }
229 
230  KKStr origFieldDesc = StrFormatInt (srcFeatureNum, "zz0") + "\t" +
231  fileDesc->FieldName (srcFeatureNum) + "\t" +
232  fileDesc->TypeStr (srcFeatureNum);
233 
234  switch (destWhatToDo[x])
235  {
236  case FeWhatToDo::FeAsIs:
237  {
238  newFileDesc->AddAAttribute (fileDesc->FieldName (x), AttributeType::Numeric, alreadyExist);
239  if (o)
240  {
241  *o << origFieldDesc << "\t"
242  << y << "\t"
243  << fileDesc->FieldName (x)
244  << endl;
245  }
246  }
247  break;
248 
250  {
251  for (kkint32 z = 0; z < cardinalityDest[x]; z++)
252  {
253  KKStr nominalValue = fileDesc->GetNominalValue (srcFeatureNums[x], z);
254  KKStr encodedName = fileDesc->FieldName (x) + "_" + nominalValue;
255  newFileDesc->AddAAttribute (encodedName, AttributeType::Numeric, alreadyExist);
256  if (o)
257  {
258  *o << origFieldDesc << "\t"
259  << y << "\t"
260  << encodedName
261  << endl;
262  }
263 
264  y++;
265  }
266  }
267 
268  break;
269 
270  case FeWhatToDo::FeScale:
271  {
272  newFileDesc->AddAAttribute (fileDesc->FieldName (x), AttributeType::Numeric, alreadyExist);
273  if (o)
274  {
275  *o << origFieldDesc << "\t"
276  << y << "\t"
277  << fileDesc->FieldName (x)
278  << endl;
279  }
280  }
281  break;
282  }
283  }
284 
285  newFileDesc = FileDesc::GetExistingFileDesc (newFileDesc);
286 
287  return newFileDesc;
288 } /* CreateEncodedFileDesc */
289 
290 
291 
292 
293 
294 FeatureVectorPtr FeatureEncoder2::EncodeAExample (FeatureVectorPtr src) const
295 {
296  FeatureVectorPtr encodedImage = new FeatureVector (codedNumOfFeatures);
297  encodedImage->MLClass (src->MLClass ());
298  encodedImage->PredictedClass (src->PredictedClass ());
299  encodedImage->TrainWeight (src->TrainWeight ());
300 
301  const float* featureData = src->FeatureData ();
302  kkint32 x;
303 
304  for (x = 0; x < numOfFeatures; x++)
305  {
306  float featureVal = featureData [srcFeatureNums[x]];
307  kkint32 y = destFeatureNums[x];
308 
309  switch (destWhatToDo[x])
310  {
311  case FeWhatToDo::FeAsIs:
312  {
313  encodedImage->AddFeatureData (y, featureVal);
314  }
315  break;
316 
318  {
319  for (kkint32 z = 0; z < cardinalityDest[x]; z++)
320  {
321  float bVal = ((kkint32)featureVal == z);
322  encodedImage->AddFeatureData (y, bVal);
323  y++;
324  }
325  }
326 
327  break;
328 
329  case FeWhatToDo::FeScale:
330  {
331  encodedImage->AddFeatureData (y, (featureVal / (float)cardinalityDest[x]));
332  }
333  break;
334  }
335  }
336 
337  return encodedImage;
338 } /* EncodeAExample */
339 
340 
341 
342 
343 
344 FeatureVectorListPtr FeatureEncoder2::EncodeAllExamples (const FeatureVectorListPtr srcData)
345 {
346  FeatureVectorListPtr encodedExamples = new FeatureVectorList (encodedFileDesc,
347  true // Will own the contents
348  );
349 
350  FeatureVectorList::const_iterator idx;
351 
352  for (idx = srcData->begin (); idx != srcData->end (); idx++)
353  {
354  const FeatureVectorPtr srcExample = *idx;
355  FeatureVectorPtr encodedExample = EncodeAExample (srcExample);
356  encodedExamples->PushOnBack (encodedExample);
357  }
358 
359  return encodedExamples;
360 } /* EncodeAllImages */
361 
362 
363 
364 
365 FeatureVectorListPtr FeatureEncoder2::EncodedFeatureVectorList (const FeatureVectorList& srcData) const
366 {
367  if (srcData.AllFieldsAreNumeric ())
368  return srcData.DuplicateListAndContents ();
369 
370  FeatureVectorListPtr encodedFeatureVectorList = new FeatureVectorList (encodedFileDesc, true);
371 
372  FeatureVectorList::const_iterator idx;
373  for (idx = srcData.begin (); idx != srcData.end (); idx++)
374  {
375  FeatureVectorPtr srcExample = *idx;
376  FeatureVectorPtr encodedFeatureVector = EncodeAExample (srcExample);
377  encodedFeatureVector->MLClass (srcExample->MLClass ());
378  encodedFeatureVectorList->PushOnBack (encodedFeatureVector);
379  }
380 
381  return encodedFeatureVectorList;
382 } /* EncodedFeatureVectorList */
383 
384 
385 
// NOTE(review): the declaration line of this type is not present in this listing; the constructor
// below shows that it is named FeatureVar2.
387 {
// Stores the four constructor arguments in the members of the same names.
388  FeatureVar2 (kkint32 _featureNum,
389  AttributeType _attributeType,
390  kkint32 _idx,
391  double _var
392  ):
393  attributeType (_attributeType),
394  featureNum (_featureNum),
395  idx (_idx),
396  var (_var)
397  {}
398 
// NOTE(review): declarations for 'attributeType', 'featureNum' and 'idx' are not visible in this
// listing; only 'var' is shown.
402  double var;
403 };
404 
405 
406 //typedef FeatureEncoder2::FeatureVar2* FeatureVar2Ptr;
407 
// NOTE(review): the declaration line of this class is not present in this listing; the constructor
// below shows that it is named FeatureVar2List.
409 {
410 public:
// NOTE(review): the initializer that should follow the ':' is missing from this listing;
// presumably '_owner' is forwarded to a base class — confirm against the real file.
411  FeatureVar2List (bool _owner):
413  {}
414 
// NOTE(review): the header of the member that owns the following empty body is missing from this listing.
416  {}
417 };
KKStr TypeStr(kkint32 fieldNum) const
Definition: FileDesc.cpp:378
void PushOnBack(FeatureVectorPtr image)
Overloading the PushOnBack function in KKQueue so we can monitor the Version and Sort Order...
FileDescPtr CreateEncodedFileDesc(ostream *o, RunLog &log) const
FeatureEncoder2(const ModelParam &_param, FileDescPtr _fileDesc)
Constructs a Feature Encoder object.
Provides a detailed description of the attributes of a dataset.
Definition: FileDesc.h:72
const VectorInt32 & CardinalityVector() const
Definition: FileDesc.h:116
static FileDescPtr GetExistingFileDesc(FileDescPtr fileDesc)
Returns a pointer to an existing instance of 'fileDesc' if it exists, otherwise will use one being pa...
Definition: FileDesc.cpp:555
__int32 kkint32
Definition: KKBaseTypes.h:88
kkint32 NumEncodedFeatures() const
bool AllFieldsAreNumeric() const
Returns true if all fields are numeric, no nominal fields.
kkuint32 NumOfFields() const
Definition: FileDesc.h:197
void AddAAttribute(const KKB::KKStr &_name, KKMLL::AttributeType _type, bool &alreadyExists)
Definition: FileDesc.cpp:169
const float * FeatureData() const
Returns as a pointer to the feature data itself.
FeatureVector(kkint32 _numOfFeatures)
void osWaitForEnter()
kkint32 MemoryConsumedEstimated() const
unsigned __int16 kkuint16
16 bit unsigned integer.
Definition: KKBaseTypes.h:86
FeatureVectorListPtr EncodedFeatureVectorList(const FeatureVectorList &srcData) const
const AttributeTypeVector & AttributeVector() const
Definition: FileDesc.h:115
FeatureVar2(kkint32 _featureNum, AttributeType _attributeType, kkint32 _idx, double _var)
~FeatureEncoder2()
Frees any memory allocated by, and owned by the FeatureEncoder2.
virtual FeatureVectorListPtr DuplicateListAndContents() const
Creates a duplicate of list and also duplicates its contents.
void TrainWeight(float _trainWeight)
Assign a specific example a higher weight for training purposes.
virtual EncodingMethodType EncodingMethod() const
Definition: ModelParam.h:111
KKStr operator+(const char *right) const
Definition: KKStr.cpp:3986
const KKStr & FieldName(kkint32 fieldNum) const
Definition: FileDesc.cpp:387
void AddFeatureData(kkint32 _featureNum, float _featureData)
MLClassPtr PredictedClass() const
FeatureVectorList(FileDescPtr _fileDesc, bool _owner)
Will create a new empty list of FeatureVector's.
describes a single Feature, Type and possible values.
Definition: Attribute.h:74
Container class for FeatureVector derived objects.
KKTHread * KKTHreadPtr
kkuint16 operator[](kkint32 idx) const
Returns back the selected feature.
const KKStr & GetNominalValue(kkint32 fieldNum, kkint32 code) const
Definition: FileDesc.cpp:395
kkint32 NumOfFeatures() const
virtual FeatureNumListConstPtr SelectedFeatures() const
Definition: ModelParam.h:116
FeatureEncoder2(const FeatureEncoder2 &_encoder)
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of 'std::string' strings.
Definition: KKStr.cpp:1082
AttributeType
Definition: Attribute.h:36
MLClassPtr MLClass() const
Class that this example is assigned to.
void MLClass(MLClassPtr _mlClass)
Assign a class to this example.
Definition: FeatureVector.h:74
FileDesc * FileDescPtr
KKStr StrFormatInt(kkint32 val, const char *mask)
Definition: KKStr.cpp:5004
KKStr operator+(const KKStr &right) const
Definition: KKStr.cpp:3998
static FileDescPtr NewContinuousDataOnly(VectorKKStr &_fieldNames)
Creates a simple FileDesc that consists of continuous data only.
Definition: FileDesc.cpp:116
void PredictedClass(MLClassPtr _predictedClass)
Definition: FeatureVector.h:78
FeatureNumListConst * FeatureNumListConstPtr
Used for logging messages.
Definition: RunLog.h:49
void EncodeProblem(const struct svm_paramater &param, struct svm_problem &prob_in, struct svm_problem &prob_out)
float TrainWeight() const
Represents a Feature Vector of a single example, labeled or unlabeled.
Definition: FeatureVector.h:59
Abstract Base class for Machine Learning parameters.
Definition: ModelParam.h:35
FeatureVectorPtr EncodeAExample(FeatureVectorPtr src) const
FeatureVectorListPtr EncodeAllExamples(const FeatureVectorListPtr srcData)