22 using namespace KKMLL;
29 duplicateDataCount (0),
30 duplicateNameCount (0),
40 log.Level (-1) << endl << endl <<
"DuplicateImages::DuplicateImages ***ERROR*** '_examples == NULL'" << endl << endl;
44 FindDuplicates (_examples);
54 duplicateDataCount (0),
55 duplicateNameCount (0),
70 delete nameTree; nameTree = NULL;
71 delete featureDataTree; featureDataTree = NULL;
72 delete dupExamples; dupExamples = NULL;
79 if (nameTree->GetEqual (fv->ExampleFileName ()) != NULL)
92 bool dupsDetected =
false;
93 FeatureVectorList::iterator idx;
94 for (idx = examples->begin (); idx != examples->end (); idx++)
115 FeatureVectorPtr existingNameExample = NULL;
120 existingNameExample = nameTree->GetEqual (osGetRootName (example->ExampleFileName ()));
121 if (!existingNameExample)
122 nameTree->RBInsert (example);
125 FeatureVectorPtr existingDataExample = featureDataTree
->GetEqual (example
);
126 if (!existingDataExample)
129 if ((existingNameExample) || (existingDataExample))
132 if (existingNameExample)
134 duplicateNameCount++;
139 dupExamples->PushOnBack (dupExample);
147 if (existingDataExample)
149 duplicateDataCount++;
150 if (existingDataExample != existingNameExample)
156 dupExamples->PushOnBack (dupExample);
178 for (
auto idx: *examples)
180 FeatureVectorPtr example = idx;
181 AddSingleExample (example);
195 bool allowDupsInSameClass,
199 log.Level (10) <<
"DuplicateImageList::PurgeDuplicates" << endl;
205 map<FeatureVectorPtr,KKStr> deletedDictionary;
206 map<FeatureVectorPtr,KKStr>::iterator deletedDictionaryIdx;
211 DuplicateImageList::iterator dIDX = dupExamples->begin ();
213 for (dIDX = dupExamples->begin (); dIDX != dupExamples->end (); ++dIDX, ++dupSetCount)
217 log.Level (20) <<
"PurgeDuplicates Duplicate Set[" << dupSet->FirstExampleAdded ()->ExampleFileName () <<
"]" << endl;
220 FeatureVectorPtr exampleToKeep = NULL;
224 if (allowDupsInSameClass)
230 FeatureVectorList::iterator iIDX = examplesInSet->begin ();
232 for (iIDX = examplesInSet->begin (); iIDX != examplesInSet->end (); ++iIDX)
234 FeatureVectorPtr example = *iIDX;
238 if (example == exampleToKeep)
240 log.Level (30) <<
"PurgeDuplicates Keeping [" << exampleToKeep->ExampleFileName () <<
"]." << endl;
242 *report << example->ExampleFileName () <<
"\t" <<
"Class" <<
"\t" << example->MLClassName () <<
"\t" <<
"Duplicate retained." << endl;
246 bool alreadyDeleted =
false;
247 deletedDictionaryIdx = deletedDictionary.find (example);
248 if (deletedDictionaryIdx != deletedDictionary.end ())
252 errMsg <<
"Example: " << deletedDictionaryIdx->second <<
" Already Been Deleted.";
253 log.Level (-1) << endl <<
"DuplicateImages::PurgeDuplicates ***ERROR*** " << errMsg << endl <<endl;
254 alreadyDeleted =
true;
259 deletedDictionary.insert (pair<FeatureVectorPtr,KKStr> (example, example->ExampleFileName ()));
261 log.Level (30) <<
"PurgeDuplicates Deleting [" << example->ExampleFileName () <<
"]." << endl;
263 *report << example->ExampleFileName () <<
"\t" <<
"Class" <<
"\t" << example->MLClassName () <<
"\t" <<
"Duplicate deleted." << endl;
264 examples->DeleteEntry (example);
265 if (examples->Owner ())
280 log.Level (10) <<
"DuplicateImages::ListOfExamplesToDelete" << endl;
284 DuplicateImageList::iterator dIDX = dupExamples->begin ();
286 for (dIDX = dupExamples->begin (); dIDX != dupExamples->end (); ++dIDX)
290 log.Level (20) <<
"ListOfExamplesToDelete Duplicate Set[" << dupSet->FirstExampleAdded ()->ExampleFileName () <<
"]" << endl;
293 FeatureVectorPtr exampleToKeep = NULL;
300 FeatureVectorList::iterator iIDX = examplesInSet->begin ();
302 for (iIDX = examplesInSet->begin (); iIDX != examplesInSet->end (); ++iIDX)
304 FeatureVectorPtr example = *iIDX;
308 if (example == exampleToKeep)
310 log.Level (30) <<
"ListOfExamplesToDelete Keeping [" << exampleToKeep->ExampleFileName () <<
"]." << endl;
314 log.Level (30) <<
"ListOfExamplesToDelete Deleting [" << example->ExampleFileName () <<
"]." << endl;
320 return examplesToDelete;
328 o <<
"Number of Duplicate Groups [" << dupExamples->QueueSize () <<
"]" << endl;
332 for (
auto dupExampleSet: *dupExamples)
334 const FeatureVectorListPtr dupList = dupExampleSet->DuplicatedImages ();
336 o <<
"Group[" << groupNum <<
"] Contains [" << dupList->QueueSize () <<
"] Duplicates." << endl;
338 kkint32 numOnLine = 0;
340 for (
auto fvIDX: *dupList)
350 o << fvIDX->ExampleFileName () <<
"[" << fvIDX->MLClassName () <<
"]";
364 return (dupExamples->QueueSize () > 0);
370 FeatureVectorPtr _image1,
371 FeatureVectorPtr _image2,
374 fileDesc (_fileDesc),
376 firstImageAdded (_image1)
378 duplicatedImages.PushOnBack (_image1);
379 duplicatedImages.PushOnBack (_image2);
390 duplicatedImages.PushOnBack (example);
399 bool allTheSameClass =
true;
401 MLClassPtr mlClass = duplicatedImages.IdxToPtr (0)->MLClass ();
403 FeatureVectorList::iterator iIDX = duplicatedImages.begin ();
405 while ((iIDX != duplicatedImages.end ()) && (allTheSameClass))
407 allTheSameClass = (*iIDX)->MLClass () == mlClass;
411 return allTheSameClass;
420 return (duplicatedImages.PtrToIdx (example) >= 0);
428 kkint32 smallestScanLine = 99999999;
429 FeatureVectorPtr imageWithSmallestScanLine = NULL;
432 for (FeatureVectorList::iterator iIDX = duplicatedImages.begin (); iIDX != duplicatedImages.end (); iIDX++)
434 FeatureVectorPtr i = *iIDX;
449 scanLine = atoi (scanLineStr.Str ());
461 scanLine = atoi (scanLineStr.Str ());
465 if ((scanLine < smallestScanLine) ||
466 (imageWithSmallestScanLine == NULL)
469 smallestScanLine = scanLine;
470 imageWithSmallestScanLine = i;
474 return imageWithSmallestScanLine;
495 for (DuplicateImageList::iterator idx = begin (); idx != end (); idx++)
499 return dupExampleSet;
void PushOnBack(FeatureVectorPtr image)
Overloading the PushOnBack function in KKQueue so we can monitor the Version and Sort Order...
DuplicateImageList(bool _owner)
ImageFeaturesNameIndexed()
DuplicateImages(FeatureVectorListPtr _examples, RunLog &_log)
You would use this instance to search for duplicates in the list of 'examples'.
DuplicateImageList * DuplicateImageListPtr
const FileDescPtr FileDesc() const
bool operator==(const char *rtStr) const
bool AlreadyHaveExample(FeatureVectorPtr example)
DuplicateImagePtr LocateByImage(FeatureVectorPtr example)
DuplicateImage * DuplicateImagePtr
bool DuplicatesFound() const
DuplicateImage(FileDescPtr _fileDesc, FeatureVectorPtr _image1, FeatureVectorPtr _image2, RunLog &_log)
bool AddExamples(FeatureVectorListPtr examples)
Will add all the examples; be careful of ownership.
void AddADuplicate(FeatureVectorPtr example)
FeatureVectorList(FileDescPtr _fileDesc, bool _owner)
Will create a new, empty list of FeatureVector objects.
DuplicateImages(FileDescPtr _fileDesc, RunLog &_log)
void RBInsert(FeatureVectorPtr example)
Container class for FeatureVector derived objects.
DuplicateImageListPtr DupExamples() const
void ReportDuplicates(std::ostream &o)
KKStr SubStrPart(kkint32 firstChar, kkint32 lastChar) const
returns a SubString consisting of all characters starting at index 'firstChar' and ending at 'lastInd...
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of 'std::string' strings.
void Upper()
Converts all characters in string to their Upper case equivalents via 'toupper'.
kkint32 LocateLastOccurrence(char ch) const
Returns index of last occurrence of 'ch' otherwise -1.
FeatureVectorPtr ExampleWithSmallestScanLine()
void PurgeDuplicates(FeatureVectorListPtr examples, bool allowDupsInSameClass, std::ostream *report)
Delete duplicate examples from FeatureVectorList structure provided in constructor.
DuplicateImagePtr AddSingleExample(FeatureVectorPtr example)
Add one more FeatureVector to the list.
bool ExampleInDetector(FeatureVectorPtr fv)
Detects duplicate images in a given FeatureVectorList object.
Used for logging messages.
void EncodeProblem(const struct svm_paramater &param, struct svm_problem &prob_in, struct svm_problem &prob_out)
FeatureVectorListPtr ListOfExamplesToDelete()
FeatureVectorPtr GetEqual(FeatureVectorPtr example)
KKStr SubStrPart(kkint32 firstChar) const
returns a SubString consisting of all characters starting at index 'firstChar' until the end of the s...
const FeatureVectorListPtr DuplicatedImages()
const KKStr & ExampleFileName() const
Name of file that this FeatureVector was computed from.
KKStr osGetRootName(const KKStr &fullFileName)
ImageFeaturesDataIndexed()