28 using namespace KKMLL;
32 cardinalityDest (NULL),
35 codedNumOfFeatures (0),
37 destFeatureNums (NULL),
42 numEncodedFeatures (0),
45 srcFeatureNums (NULL),
46 xSpaceNeededPerExample (0)
67 cardinalityDest (NULL),
70 codedNumOfFeatures (0),
72 destFeatureNums (NULL),
75 encodingMethod (_encodingMethod),
77 numEncodedFeatures (0),
79 selectedFeatures
(_selectedFeatures
),
80 srcFeatureNums (NULL),
81 xSpaceNeededPerExample (0)
85 xSpaceNeededPerExample = 0;
86 srcFeatureNums =
new kkint32[numOfFeatures];
87 cardinalityDest =
new kkint32[numOfFeatures];
88 destFeatureNums =
new kkint32[numOfFeatures];
95 for (x = 0; x < numOfFeatures; x++)
98 srcFeatureNums [x] = srcFeatureNum;
99 destFeatureNums [x] = xSpaceNeededPerExample;
100 cardinalityDest [x] = 1;
107 switch (encodingMethod)
113 cardinalityDest [x] = cardinality;
114 xSpaceNeededPerExample += cardinalityDest[x];
115 numEncodedFeatures += cardinalityDest[x];
116 for (
kkint32 zed = 0; zed < cardinalityDest[x]; zed++)
119 destFieldNames.push_back (fieldName);
124 xSpaceNeededPerExample++;
125 numEncodedFeatures++;
127 destFieldNames.push_back (attribute.Name ());
133 xSpaceNeededPerExample++;
134 numEncodedFeatures++;
142 destFieldNames.push_back (attribute.Name ());
148 xSpaceNeededPerExample++;
149 numEncodedFeatures++;
151 destFieldNames.push_back (attribute.Name ());
156 codedNumOfFeatures = xSpaceNeededPerExample;
160 xSpaceNeededPerExample++;
174 delete srcFeatureNums;
175 delete destFeatureNums;
176 delete cardinalityDest;
186 + numOfFeatures *
sizeof (
kkint32);
189 memoryConsumedEstimated += 3 *
sizeof (
kkint32) * numOfFeatures;
193 memoryConsumedEstimated +=
sizeof (
FeWhatToDo) * numOfFeatures;
195 return memoryConsumedEstimated;
208 <<
"Orig" <<
"\t" <<
"Orig" <<
"\t" <<
"Field" <<
"\t" <<
"Encoded" <<
"\t" <<
"Encoded" << endl;
209 *o <<
"FieldNum" <<
"\t" <<
"FieldName" <<
"\t" <<
"Type" <<
"\t" <<
"FieldNum" <<
"\t" <<
"FieldName" << endl;
216 for (x = 0; x < numOfFeatures; x++)
218 kkint32 srcFeatureNum = srcFeatureNums[x];
219 kkint32 y = destFeatureNums[x];
221 if (y >= numEncodedFeatures)
224 errMsg <<
"FeatureEncoder::CreateEncodedFileDesc numEncodedFeatures [" << numEncodedFeatures <<
"] exceeded.";
226 <<
"FeatureEncoder::CreateEncodedFileDesc *** ERROR ***" << endl
227 <<
" " << errMsg << endl
238 switch (destWhatToDo[x])
245 *o << origFieldDesc <<
"\t" 247 << fileDesc->FieldName (x)
255 for (
kkint32 z = 0; z < cardinalityDest[x]; z++)
262 *o << origFieldDesc <<
"\t" 279 *o << origFieldDesc <<
"\t" 281 << fileDesc->FieldName (x)
304 XSpacePtr xSpace =
new svm_node[xSpaceNeededPerExample];
317 FeatureVectorPtr encodedExample =
new FeatureVector (numEncodedFeatures
);
326 for (x = 0; x < numOfFeatures; x++)
328 float featureVal = featureData [srcFeatureNums[x]];
329 kkint32 y = destFeatureNums[x];
331 switch (destWhatToDo[x])
341 for (
kkint32 z = 0; z < cardinalityDest[x]; z++)
343 float bVal = ((
kkint32)featureVal == z);
359 return encodedExample;
372 FeatureVectorList::const_iterator idx;
374 for (idx = srcData->begin (); idx != srcData->end (); idx++)
376 const FeatureVectorPtr srcExample = *idx;
381 return encodedExamples;
402 for (x = 0; x < numOfFeatures; x++)
404 float featureVal = featureData [srcFeatureNums[x]];
405 kkint32 y = destFeatureNums[x];
407 if (y >= xSpaceNeededPerExample)
410 errMsg <<
"FeatureEncoder::EncodeAExample ***ERROR*** xSpaceNeededPerExample[" << xSpaceNeededPerExample <<
"].";
412 <<
"FeatureEncoder::EncodeAExample *** ERROR ***" << endl
413 <<
" " << errMsg << endl
418 switch (destWhatToDo[x])
422 if (featureVal != 0.0)
424 xSpace[xSpaceUsed]
.index = y;
425 xSpace[xSpaceUsed]
.value = featureVal;
433 for (
kkint32 z = 0; z < cardinalityDest[x]; z++)
435 float bVal = ((
kkint32)featureVal == z);
438 xSpace[xSpaceUsed]
.index = y;
439 xSpace[xSpaceUsed]
.value = bVal;
450 if (featureVal != (
float)0.0)
452 xSpace[xSpaceUsed]
.index = y;
453 xSpace[xSpaceUsed]
.value = featureVal / (
float)cardinalityDest[x];
461 xSpace[xSpaceUsed]
.index = -1;
462 xSpace[xSpaceUsed]
.value = -1;
471 FeatureVectorList::const_iterator idx;
472 for (idx = src->begin (); idx != src->end (); ++idx)
474 FeatureVectorPtr fv = *idx;
477 for (
kkint32 x = 0; x < numOfFeatures; x++)
479 float featureVal = featureData [srcFeatureNums[x]];
480 kkint32 y = destFeatureNums[x];
482 switch (destWhatToDo[x])
485 if (featureVal != 0.0)
490 for (
kkint32 z = 0; z < cardinalityDest[x]; z++)
492 float bVal = ((
kkint32)featureVal == z);
500 if (featureVal != (
float)0.0)
508 return xSpaceNodesNeeded;
514 (FeatureVectorListPtr src,
523 FeatureVectorListPtr compressedExamples = NULL;
524 FeatureVectorListPtr examplesToUseFoXSpace = NULL;
529 examplesToUseFoXSpace = src;
531 kkint32 numOfExamples = examplesToUseFoXSpace->QueueSize ();
534 prob
.l = numOfExamples;
535 prob.y = (
double*)malloc (prob.l *
sizeof (
double));
536 prob.x = (
struct svm_node **) malloc (prob.l *
sizeof (
struct svm_node*));
538 prob.exampleNames.clear ();
540 kkint32 numNeededXspaceNodes = DetermineNumberOfNeededXspaceNodes (examplesToUseFoXSpace);
542 kkint32 totalBytesForxSpaceNeeded = (numNeededXspaceNodes + 10) *
sizeof (
struct svm_node);
544 xSpace = (
struct svm_node*) malloc (totalBytesForxSpaceNeeded);
547 log.Level (-1) << endl << endl << endl
548 <<
" FeatureEncoder::Compress *** Failed to allocates space for 'xSpace' ****" << endl
550 <<
" Space needed [" << totalBytesForxSpaceNeeded <<
"]" << endl
551 <<
" Num of Examples [" << numOfExamples <<
"]" << endl
552 <<
" Num XSpaceNodesNeeded [" << numNeededXspaceNodes <<
"]" << endl
562 FeatureVectorPtr example = NULL;
566 kkint32 bytesOfxSpacePerExample = xSpaceNeededPerExample *
sizeof (
struct svm_node);
568 for (i = 0; i < prob
.l; i++)
570 if (totalxSpaceUsed > numNeededXspaceNodes)
572 log.Level (-1) << endl << endl
573 <<
"FeatureEncoder::Compress ***ERROR*** We have exceeded the number of XSpace nodes allocated." << endl
577 example = examplesToUseFoXSpace->IdxToPtr (i);
585 prob
.y[i] = lastClassNum;
587 prob.exampleNames.push_back (osGetRootName (example->ExampleFileName ()));
594 log.Level (-1) << endl
595 <<
"FeatureEncoder::EncodeIntoSparseMatrix ***ERROR*** Example[" << example->ExampleFileName () <<
"]" << endl
596 <<
" has a TrainWeight value of 0 or less defaulting to 1.0" << endl
598 prob
.W[i] = 1.0 * c_Param;
604 struct svm_node* xSpaceThisExample = (
struct svm_node*) malloc (bytesOfxSpacePerExample);
605 prob
.x[i] = xSpaceThisExample;
607 if (xSpaceUsed < xSpaceNeededPerExample)
609 kkint32 bytesNeededForThisExample = xSpaceUsed *
sizeof (
struct svm_node);
610 struct svm_node* smallerXSpaceThisExample = (
struct svm_node*) malloc (bytesNeededForThisExample);
611 memcpy (smallerXSpaceThisExample, xSpaceThisExample, bytesNeededForThisExample);
612 free (xSpaceThisExample);
613 prob
.x[i] = smallerXSpaceThisExample;
618 prob
.x[i] = &xSpace[totalxSpaceUsed];
621 totalxSpaceUsed += xSpaceUsed;
624 delete compressedExamples;
638 FeatureVectorListPtr compressedExamples,
642 double time_before, time_after;
646 compressedExamples->Owner (
false);
660 FeatureVectorList::iterator idx;
661 for (idx = srcData.begin (); idx != srcData.end (); idx++)
663 FeatureVectorPtr srcExample = *idx;
667 FeatureVectorPtr encodedFeatureVector =
new FeatureVector (codedNumOfFeatures
);
668 while (encodedData[zed]
.index != -1)
681 return encodedFeatureVectorList;
698 XmlElementInt32::WriteXML (codedNumOfFeatures,
"CodedNumOfFeatures", o);
699 XmlElementDouble::WriteXML (c_Param,
"c_Param", o);
700 XmlElementInt32::WriteXML (numEncodedFeatures,
"NumEncodedFeatures", o);
701 XmlElementInt32::WriteXML (numOfFeatures,
"NumOfFeatures", o);
702 XmlElementInt32::WriteXML (xSpaceNeededPerExample,
"xSpaceNeededPerExample", o);
705 XmlElementArrayInt32::WriteXML (numOfFeatures, cardinalityDest,
"CardinalityDest", o);
710 XmlElementArrayInt32::WriteXML (numOfFeatures, destFeatureNums,
"DestFeatureNums", o);
713 if (destFileDesc) destFileDesc
->WriteXML ("DestFileDesc", o
);
718 for (kkint32 x = 0; x < numOfFeatures; ++x)
719 v.push_back ((kkint32)(destWhatToDo[x]));
720 XmlElementVectorInt32::WriteXML (v,
"DestWhatToDo", o);
728 XmlElementArrayInt32::WriteXML (numOfFeatures, srcFeatureNums,
"SrcFeatureNums", o);
744 while (t && (!cancelFlag))
768 else if (
typeid (*e) ==
typeid (XmlElementArrayInt32))
770 XmlElementArrayInt32Ptr xmlArray =
dynamic_cast<XmlElementArrayInt32Ptr> (e);
771 kkuint32 count = xmlArray->Count ();
772 if (count != numOfFeatures)
774 log.Level (-1) << endl
775 <<
"FeatureEncoder::ReadXML ***ERROR*** Variable[" << varName <<
"] Invalid Length[" << count <<
"] Expected[" << numOfFeatures <<
"]" << endl
782 delete cardinalityDest;
783 cardinalityDest = xmlArray->TakeOwnership ();
788 delete destFeatureNums;
789 destFeatureNums = xmlArray->TakeOwnership ();
794 delete srcFeatureNums;
795 srcFeatureNums = xmlArray->TakeOwnership ();
812 else if (varName
.EqualIgnoreCase ("DestWhatToDo") && (
typeid (*e) ==
typeid (XmlElementVectorInt32)))
814 XmlElementVectorInt32Ptr xmlVect =
dynamic_cast<XmlElementVectorInt32Ptr> (e);
815 if (xmlVect && xmlVect->Value ())
817 const VectorInt32& v = *(xmlVect->Value ());
818 if (v.size () != numOfFeatures)
820 log.Level (-1) << endl
821 <<
"FeatureEncoder::ReadXML ***ERROR*** Variable[" << varName <<
"] Invalid Size[" << v.size () <<
"] Expected[" << numOfFeatures <<
"]." << endl
827 destWhatToDo =
new FeWhatToDo[v.size ()];
828 for (kkuint32 x = 0; x < v.size (); ++x)
829 destWhatToDo[x] = (FeWhatToDo)v[x];
839 log.Level (-1) <<
"XmlElementTrainingClassList ***ERROR*** Un-expected Section Element[" << e->SectionName () <<
"]" << endl;
__int16 kkint16
16 bit signed integer.
virtual void ReadXML(XmlStream &s, XmlTagConstPtr tag, VolConstBool &cancelFlag, RunLog &log)
KKStr(kkint32 size)
Creates a KKStr object that pre-allocates space for 'size' characters.
XmlTag(const KKStr &_name, TagTypes _tagType)
KKStr TypeStr(kkint32 fieldNum) const
void PushOnBack(FeatureVectorPtr image)
Overloading the PushOnBack function in KKQueue so we can monitor the Version and Sort Order...
virtual double ToDouble() const
void AddQueue(const FeatureVectorList &examplesToAdd)
Add the contents of 'examplesToAdd' to the end of this list.
Provides a detailed description of the attributes of a dataset.
bool EqualIgnoreCase(const char *s2) const
FeatureVectorListPtr CreateEncodedFeatureVector(FeatureVectorList &srcData)
const KKStr & GetNominalValue(kkint32 code) const
Returns the nominal value for the given ordinal value.
bool AllFieldsAreNumeric() const
Returns true if all fields are numeric, no nominal fields.
void AddAAttribute(const KKB::KKStr &_name, KKMLL::AttributeType _type, bool &alreadyExists)
const float * FeatureData() const
Returns as a pointer to the feature data itself.
virtual KKStr ToKKStr() const
Keeps track of selected features.
FeatureVector(kkint32 _numOfFeatures)
KKStr EncodingMethodToStr(SVM_EncodingMethod encodingMethod)
kkint16 GetNumForClass(MLClassPtr mlClass) const
Represents a "Class" in the Machine Learning Sense.
SVM_EncodingMethod EncodingMethodFromStr(const KKStr &encodingMethodStr)
virtual FeatureVectorListPtr DuplicateListAndContents() const
Creates a duplicate of the list and also duplicates its contents.
void TrainWeight(float _trainWeight)
Assign a specific example a higher weight for training purposes.
KKStr operator+(const char *right) const
~FeatureEncoder()
Frees any memory allocated by, and owned by the FeatureEncoder.
const KKStr & FieldName(kkint32 fieldNum) const
FileDescPtr CreateEncodedFileDesc(ostream *o)
unsigned __int32 kkuint32
void AddFeatureData(kkint32 _featureNum, float _featureData)
MLClassPtr PredictedClass() const
FeatureNumList(const FeatureNumList &featureNumList)
Copy constructor.
FeatureVectorListPtr EncodeAllExamples(const FeatureVectorListPtr srcData)
FeatureVectorList(FileDescPtr _fileDesc, bool _owner)
Will create a new empty list of FeatureVector's.
Describes a single Feature, Type and possible values.
Container class for FeatureVector derived objects.
virtual kkint32 ToInt32() const
kkuint16 operator[](kkint32 idx) const
Returns back the selected feature.
const KKStr & GetNominalValue(kkint32 fieldNum, kkint32 code) const
virtual void WriteXML(const KKStr &varName, ostream &o) const
XmlElement * XmlElementPtr
void AddAtribute(const KKStr &attributeName, const KKStr &attributeValue)
kkint32 NumOfFeatures() const
XmlTag const * XmlTagConstPtr
Manages the reading and writing of objects in a simple XML format. For a class to be supported by Xml...
XSpacePtr EncodeAExample(FeatureVectorPtr example)
Converts a single example into the svm_problem format.
Binds MLClass objects to the appropriate number that the Learning Algorithm expects.
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of 'std::string' strings.
std::vector< kkint32 > VectorInt32
Vector of signed 32 bit integers.
void EncodeIntoSparseMatrix(FeatureVectorListPtr src, ClassAssignments &assignments, XSpacePtr &xSpace, kkint32 &totalxSpaceUsed, struct svm_problem &prob, RunLog &log)
Compresses 'src' examples, allocating new 'xSpace' data structure.
MLClassPtr MLClass() const
Class that this example is assigned to.
void MLClass(MLClassPtr _mlClass)
Assign a class to this example.
virtual const KKStr & VarName() const
KKStr StrFormatInt(kkint32 val, const char *mask)
const KKStr & Name() const
AttributeType Type() const
void WriteXML(const KKStr &varName, std::ostream &o) const
KKStr operator+(const KKStr &right) const
static FileDescPtr NewContinuousDataOnly(VectorKKStr &_fieldNames)
Creates a simple FileDesc that consists of continuous data only.
static MLClassPtr CreateNewMLClass(const KKStr &_name, kkint32 _classId=-1)
Static method used to create a new instance of a MLClass object.
FeatureVectorPtr EncodeAExample(FileDescPtr encodedFileDesc, FeatureVectorPtr src)
double osGetSystemTimeUsed()
Returns the number of CPU seconds used by current process.
void CompressExamples(FeatureVectorListPtr srcExamples, FeatureVectorListPtr compressedExamples, ClassAssignments &assignments)
Left over from BitReduction days; removed all code except that which processed the NO bit reduction o...
void PredictedClass(MLClassPtr _predictedClass)
void WriteXML(std::ostream &o)
Used for logging messages.
void EncodeProblem(const struct svm_paramater &param, struct svm_problem &prob_in, struct svm_problem &prob_out)
FeatureEncoder(FileDescPtr _fileDesc, MLClassPtr _class1, MLClassPtr _class2, const FeatureNumList &_selectedFeatures, SVM_EncodingMethod _encodingMethod, double _c_Param)
Constructs a Feature Encoder object.
float TrainWeight() const
const KKStr & Name() const
void EncodeAExample(FeatureVectorPtr example, svm_node *xSpace, kkint32 &xSpaceUsed)
Converts a single example into the svm_problem format.
virtual TokenTypes TokenType()=0
FileDescPtr Value() const
virtual XmlTokenPtr GetNextToken(VolConstBool &cancelFlag, RunLog &log)
KKException(const KKStr &_exceptionStr)
kkint32 Cardinality() const
Returns back the cardinality of the attribute; the number of possible values it can take...
Represents a Feature Vector of a single example, labeled or unlabeled.
kkint32 MemoryConsumedEstimated() const
void WriteXML(const KKStr &varName, std::ostream &o) const
kkint32 MemoryConsumedEstimated() const
void WriteXML(const KKStr &varName, std::ostream &o) const
const KKMLL::Attribute & GetAAttribute(kkint32 fieldNum) const
XmlElementFileDesc * XmlElementFileDescPtr
volatile const bool VolConstBool