KSquare Utilities
DuplicateImages.cpp
Go to the documentation of this file.
1 #include "FirstIncludes.h"
2 #include <stdio.h>
3 #include <vector>
4 #include <iostream>
5 #include <fstream>
6 #include "MemoryDebug.h"
7 using namespace std;
8 
9 #include "KKBaseTypes.h"
10 #include "OSservices.h"
11 using namespace KKB;
12 
13 
14 #include "DuplicateImages.h"
15 
16 
17 //#include "FeatureFileIOKK.h"
18 //#include "FeatureFileIOPices.h"
19 #include "FeatureVector.h"
22 using namespace KKMLL;
23 
24 
25 DuplicateImages::DuplicateImages (FeatureVectorListPtr _examples,
26  RunLog& _log
27  ):
28  duplicateCount (0),
29  duplicateDataCount (0),
30  duplicateNameCount (0),
31  dupExamples (new DuplicateImageList (true)),
32  featureDataTree (new ImageFeaturesDataIndexed ()),
33  fileDesc (NULL),
34  log (_log),
35  nameTree (new ImageFeaturesNameIndexed ())
36 
37 {
38  if (!_examples)
39  {
40  log.Level (-1) << endl << endl << "DuplicateImages::DuplicateImages ***ERROR*** '_examples == NULL'" << endl << endl;
41  return;
42  }
43  fileDesc = _examples->FileDesc ();
44  FindDuplicates (_examples);
45 }
46 
47 
48 
49 
51  RunLog& _log
52  ):
53  duplicateCount (0),
54  duplicateDataCount (0),
55  duplicateNameCount (0),
56  dupExamples (new DuplicateImageList (true)),
57  featureDataTree (new ImageFeaturesDataIndexed ()),
58  fileDesc (_fileDesc),
59  log (_log),
60  nameTree (new ImageFeaturesNameIndexed ())
61 
62 {
63 }
64 
65 
66 
67 
69 {
70  delete nameTree; nameTree = NULL;
71  delete featureDataTree; featureDataTree = NULL;
72  delete dupExamples; dupExamples = NULL;
73 }
74 
75 
76 
77 bool DuplicateImages::ExampleInDetector (FeatureVectorPtr fv)
78 {
79  if (nameTree->GetEqual (fv->ExampleFileName ()) != NULL)
80  return true;
81 
82  if (featureDataTree->GetEqual (fv) != NULL)
83  return true;
84 
85  return false;
86 } /* ExampleInDetector */
87 
88 
89 
90 bool DuplicateImages::AddExamples (FeatureVectorListPtr examples)
91 {
92  bool dupsDetected = false;
93  FeatureVectorList::iterator idx;
94  for (idx = examples->begin (); idx != examples->end (); idx++)
95  {
96  DuplicateImagePtr dupExample = AddSingleExample (*idx);
97  if (dupExample)
98  dupsDetected = true;
99  }
100 
101  return dupsDetected;
102 } /* AddExamples */
103 
104 
105 
106 
107 /**
108  *@brief Will add one more example to list and if it turns out to be a duplicate will return pointer to a "DuplicateImage" structure
109  * that will contain a list of all images that it is duplicate to. If no duplicate found will then return a NULL pointer.
110  */
112 {
113  DuplicateImagePtr dupExample = NULL;
114 
115  FeatureVectorPtr existingNameExample = NULL;
116 
117  const KKStr& imageFileName = example->ExampleFileName ();
118  if (!imageFileName.Empty ())
119  {
120  existingNameExample = nameTree->GetEqual (osGetRootName (example->ExampleFileName ()));
121  if (!existingNameExample)
122  nameTree->RBInsert (example);
123  }
124 
125  FeatureVectorPtr existingDataExample = featureDataTree->GetEqual (example);
126  if (!existingDataExample)
127  featureDataTree->RBInsert (example);
128 
129  if ((existingNameExample) || (existingDataExample))
130  {
131  duplicateCount++;
132  if (existingNameExample)
133  {
134  duplicateNameCount++;
135  dupExample = dupExamples->LocateByImage (existingNameExample);
136  if (!dupExample)
137  {
138  dupExample = new DuplicateImage (fileDesc, existingNameExample, example, log);
139  dupExamples->PushOnBack (dupExample);
140  }
141  else
142  {
143  dupExample->AddADuplicate (example);
144  }
145  }
146 
147  if (existingDataExample)
148  {
149  duplicateDataCount++;
150  if (existingDataExample != existingNameExample)
151  {
152  dupExample = dupExamples->LocateByImage (existingDataExample);
153  if (!dupExample)
154  {
155  dupExample = new DuplicateImage (fileDesc, existingDataExample, example, log);
156  dupExamples->PushOnBack (dupExample);
157  }
158  else
159  {
160  dupExample->AddADuplicate (example);
161  }
162  }
163  }
164  }
165 
166  return dupExample;
167 } /* AddSingleExample */
168 
169 
170 
171 
172 
173 void DuplicateImages::FindDuplicates (FeatureVectorListPtr examples)
174 {
175  if (!examples)
176  return;
177 
178  for (auto idx: *examples)
179  {
180  FeatureVectorPtr example = idx;
181  AddSingleExample (example);
182  }
183 } /* FindDuplicates */
184 
185 
186 
187 /**
188  *@brief Delete duplicate examples from FeatureVectorList structure provided in constructor.
189  *@details
190  * If duplicates are in more than one class then all will be deleted.
191  * if duplicates are in a single class then one with smallest scan line will be kept
192  * while all others will be deleted.
193  */
194 void DuplicateImages::PurgeDuplicates (FeatureVectorListPtr examples,
195  bool allowDupsInSameClass,
196  ostream* report
197  )
198 {
199  log.Level (10) << "DuplicateImageList::PurgeDuplicates" << endl;
200 
201 
202  // To make sure that we do not delete the same example Twice I added 'deletedDictionary' below.
203  // if will track all examples by address that have been deleted. I did this because a bug in
204  // the duplicate detector routine had the same example added to to different groups of duplicates.
205  map<FeatureVectorPtr,KKStr> deletedDictionary; // List of examples already deleted.
206  map<FeatureVectorPtr,KKStr>::iterator deletedDictionaryIdx;
207 
208  DuplicateImageListPtr dupExamples = DupExamples ();
209 
210  kkint32 dupSetCount = 0;
211  DuplicateImageList::iterator dIDX = dupExamples->begin ();
212 
213  for (dIDX = dupExamples->begin (); dIDX != dupExamples->end (); ++dIDX, ++dupSetCount)
214  {
215  DuplicateImagePtr dupSet = *dIDX;
216 
217  log.Level (20) << "PurgeDuplicates Duplicate Set[" << dupSet->FirstExampleAdded ()->ExampleFileName () << "]" << endl;
218 
219  FeatureVectorListPtr examplesInSet = dupSet->DuplicatedImages ();
220  FeatureVectorPtr exampleToKeep = NULL;
221 
222  if (dupSet->AllTheSameClass ())
223  {
224  if (allowDupsInSameClass)
225  continue;
226  else
227  exampleToKeep = dupSet->ExampleWithSmallestScanLine ();
228  }
229 
230  FeatureVectorList::iterator iIDX = examplesInSet->begin ();
231 
232  for (iIDX = examplesInSet->begin (); iIDX != examplesInSet->end (); ++iIDX)
233  {
234  FeatureVectorPtr example = *iIDX;
235  if (!example)
236  continue;
237 
238  if (example == exampleToKeep)
239  {
240  log.Level (30) << "PurgeDuplicates Keeping [" << exampleToKeep->ExampleFileName () << "]." << endl;
241  if (report)
242  *report << example->ExampleFileName () << "\t" << "Class" << "\t" << example->MLClassName () << "\t" << "Duplicate retained." << endl;
243  }
244  else
245  {
246  bool alreadyDeleted = false;
247  deletedDictionaryIdx = deletedDictionary.find (example);
248  if (deletedDictionaryIdx != deletedDictionary.end ())
249  {
250  // AHA We are getting ready to delete an entry we have already deleted ????
251  KKStr errMsg (1024);
252  errMsg << "Example: " << deletedDictionaryIdx->second << " Already Been Deleted.";
253  log.Level (-1) << endl << "DuplicateImages::PurgeDuplicates ***ERROR*** " << errMsg << endl <<endl;
254  alreadyDeleted = true;
255  }
256 
257  if (!alreadyDeleted)
258  {
259  deletedDictionary.insert (pair<FeatureVectorPtr,KKStr> (example, example->ExampleFileName ()));
260 
261  log.Level (30) << "PurgeDuplicates Deleting [" << example->ExampleFileName () << "]." << endl;
262  if (report)
263  *report << example->ExampleFileName () << "\t" << "Class" << "\t" << example->MLClassName () << "\t" << "Duplicate deleted." << endl;
264  examples->DeleteEntry (example);
265  if (examples->Owner ())
266  delete example;
267  }
268  }
269  }
270  }
271 } /* PurgeDuplicates */
272 
273 
274 
275 
276 FeatureVectorListPtr DuplicateImages::ListOfExamplesToDelete ()
277 {
278  FeatureVectorListPtr examplesToDelete = new FeatureVectorList (fileDesc, false);
279 
280  log.Level (10) << "DuplicateImages::ListOfExamplesToDelete" << endl;
281 
282  DuplicateImageListPtr dupExamples = DupExamples ();
283 
284  DuplicateImageList::iterator dIDX = dupExamples->begin ();
285 
286  for (dIDX = dupExamples->begin (); dIDX != dupExamples->end (); ++dIDX)
287  {
288  DuplicateImagePtr dupSet = *dIDX;
289 
290  log.Level (20) << "ListOfExamplesToDelete Duplicate Set[" << dupSet->FirstExampleAdded ()->ExampleFileName () << "]" << endl;
291 
292  FeatureVectorListPtr examplesInSet = dupSet->DuplicatedImages ();
293  FeatureVectorPtr exampleToKeep = NULL;
294 
295  if (dupSet->AllTheSameClass ())
296  {
297  exampleToKeep = dupSet->ExampleWithSmallestScanLine ();
298  }
299 
300  FeatureVectorList::iterator iIDX = examplesInSet->begin ();
301 
302  for (iIDX = examplesInSet->begin (); iIDX != examplesInSet->end (); ++iIDX)
303  {
304  FeatureVectorPtr example = *iIDX;
305  if (!example)
306  continue;
307 
308  if (example == exampleToKeep)
309  {
310  log.Level (30) << "ListOfExamplesToDelete Keeping [" << exampleToKeep->ExampleFileName () << "]." << endl;
311  }
312  else
313  {
314  log.Level (30) << "ListOfExamplesToDelete Deleting [" << example->ExampleFileName () << "]." << endl;
315  examplesToDelete->PushOnBack (example);
316  }
317  }
318  }
319 
320  return examplesToDelete;
321 } /* ListOfExamplesToDelete */
322 
323 
324 
325 
327 {
328  o << "Number of Duplicate Groups [" << dupExamples->QueueSize () << "]" << endl;
329  kkint32 groupNum = 0;
330 
331  //for (DuplicateImageList::iterator idx = dupExamples->begin (); idx != dupExamples->end (); idx++)
332  for (auto dupExampleSet: *dupExamples)
333  {
334  const FeatureVectorListPtr dupList = dupExampleSet->DuplicatedImages ();
335 
336  o << "Group[" << groupNum << "] Contains [" << dupList->QueueSize () << "] Duplicates." << endl;
337 
338  kkint32 numOnLine = 0;
339  //FeatureVectorList::const_iterator fvIDX;
340  for (auto fvIDX: *dupList) // = dupList->begin (); fvIDX != dupList->end (); ++fvIDX)
341  {
342  if (numOnLine > 8)
343  {
344  o << endl;
345  numOnLine = 0;
346  }
347 
348  if (numOnLine > 0)
349  o << "\t";
350  o << fvIDX->ExampleFileName () << "[" << fvIDX->MLClassName () << "]";
351 
352  numOnLine++;
353  }
354  o << endl << endl;;
355 
356  groupNum++;
357  }
358 } /* ReportDuplicates */
359 
360 
361 
363 {
364  return (dupExamples->QueueSize () > 0);
365 }
366 
367 
368 
370  FeatureVectorPtr _image1, /**< image1, will be the one that we was already in the index structures. */
371  FeatureVectorPtr _image2,
372  RunLog& _log
373  ):
374  fileDesc (_fileDesc),
375  duplicatedImages (_fileDesc, false),
376  firstImageAdded (_image1)
377 {
378  duplicatedImages.PushOnBack (_image1);
379  duplicatedImages.PushOnBack (_image2);
380 }
381 
382 
384 {
385 }
386 
387 
388 void DuplicateImage::AddADuplicate (FeatureVectorPtr example)
389 {
390  duplicatedImages.PushOnBack (example);
391 } /* AddADuplicate */
392 
393 
394 
395 
396 
398 {
399  bool allTheSameClass = true;
400 
401  MLClassPtr mlClass = duplicatedImages.IdxToPtr (0)->MLClass ();
402 
403  FeatureVectorList::iterator iIDX = duplicatedImages.begin ();
404 
405  while ((iIDX != duplicatedImages.end ()) && (allTheSameClass))
406  {
407  allTheSameClass = (*iIDX)->MLClass () == mlClass;
408  iIDX++;
409  }
410 
411  return allTheSameClass;
412 } /* AllTheSameClass */
413 
414 
415 
416 
417 
418 bool DuplicateImage::AlreadyHaveExample (FeatureVectorPtr example)
419 {
420  return (duplicatedImages.PtrToIdx (example) >= 0);
421 }
422 
423 
424 
425 
427 {
428  kkint32 smallestScanLine = 99999999;
429  FeatureVectorPtr imageWithSmallestScanLine = NULL;
430 
431 
432  for (FeatureVectorList::iterator iIDX = duplicatedImages.begin (); iIDX != duplicatedImages.end (); iIDX++)
433  {
434  FeatureVectorPtr i = *iIDX;
435  // First lets derive scan line from example file name
436 
438  rootName.Upper ();
439 
440  kkint32 scanLine = 9999999;
441 
442  if (rootName.SubStrPart (0, 4) == "FRAME")
443  {
444  // Scan line will be last seq number in name.
445  kkint32 x = rootName.LocateLastOccurrence ('_');
446  if (x > 0)
447  {
448  KKStr scanLineStr = rootName.SubStrPart (x + 1);
449  scanLine = atoi (scanLineStr.Str ());
450  }
451  }
452  else
453  {
454  // Scan should be 2nd to last seq number in name.
455  kkint32 x = rootName.LocateLastOccurrence ('_');
456  if (x > 0)
457  {
458  KKStr workStr = rootName.SubStrPart (0, x - 1);
459  kkint32 x = workStr.LocateLastOccurrence ('_');
460  KKStr scanLineStr = workStr.SubStrPart (x + 1);
461  scanLine = atoi (scanLineStr.Str ());
462  }
463  }
464 
465  if ((scanLine < smallestScanLine) ||
466  (imageWithSmallestScanLine == NULL)
467  )
468  {
469  smallestScanLine = scanLine;
470  imageWithSmallestScanLine = i;
471  }
472  }
473 
474  return imageWithSmallestScanLine;
475 } /* ImageWithSmallestScalLine */
476 
477 
478 
479 
482 {
483 }
484 
485 
486 
488 {
489 }
490 
491 
492 
494 {
495  for (DuplicateImageList::iterator idx = begin (); idx != end (); idx++)
496  {
497  DuplicateImagePtr dupExampleSet = *idx;
498  if (dupExampleSet->AlreadyHaveExample (example))
499  return dupExampleSet;
500  }
501 
502  return NULL;
503 } /* LocateByImage */
void PushOnBack(FeatureVectorPtr image)
Overloading the PushOnBack function in KKQueue so we can monitor the Version and Sort Order...
MLClass * MLClassPtr
Definition: MLClass.h:46
__int32 kkint32
Definition: KKBaseTypes.h:88
DuplicateImages(FeatureVectorListPtr _examples, RunLog &_log)
You would use this instance to search for duplicates in the list of 'examples'.
DuplicateImageList * DuplicateImageListPtr
const FileDescPtr FileDesc() const
bool operator==(const char *rtStr) const
Definition: KKStr.cpp:1588
bool AlreadyHaveExample(FeatureVectorPtr example)
DuplicateImagePtr LocateByImage(FeatureVectorPtr example)
DuplicateImage * DuplicateImagePtr
DuplicateImage(FileDescPtr _fileDesc, FeatureVectorPtr _image1, FeatureVectorPtr _image2, RunLog &_log)
bool AddExamples(FeatureVectorListPtr examples)
Will add all the examples; be careful of ownership.
void AddADuplicate(FeatureVectorPtr example)
FeatureVectorList(FileDescPtr _fileDesc, bool _owner)
Will create a new empty list of FeatureVector objects.
DuplicateImages(FileDescPtr _fileDesc, RunLog &_log)
void RBInsert(FeatureVectorPtr example)
Container class for FeatureVector derived objects.
KKTHread * KKTHreadPtr
DuplicateImageListPtr DupExamples() const
void ReportDuplicates(std::ostream &o)
bool Empty() const
Definition: KKStr.h:241
KKStr SubStrPart(kkint32 firstChar, kkint32 lastChar) const
Returns a substring consisting of all characters starting at index 'firstChar' and ending at 'lastInd...
Definition: KKStr.cpp:2802
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of &#39;std::string&#39; strings.
Definition: KKStr.cpp:1082
void Upper()
Converts all characters in the string to their upper case equivalents via 'toupper'.
Definition: KKStr.cpp:2461
kkint32 LocateLastOccurrence(char ch) const
Returns the index of the last occurrence of 'ch'; otherwise -1.
Definition: KKStr.cpp:2118
FeatureVectorPtr ExampleWithSmallestScanLine()
FileDesc * FileDescPtr
void PurgeDuplicates(FeatureVectorListPtr examples, bool allowDupsInSameClass, std::ostream *report)
Delete duplicate examples from FeatureVectorList structure provided in constructor.
DuplicateImagePtr AddSingleExample(FeatureVectorPtr example)
Add one more FeatureVector to the list.
bool ExampleInDetector(FeatureVectorPtr fv)
Detects duplicate images in a given FeatureVectorList object.
Used for logging messages.
Definition: RunLog.h:49
void EncodeProblem(const struct svm_paramater &param, struct svm_problem &prob_in, struct svm_problem &prob_out)
FeatureVectorListPtr ListOfExamplesToDelete()
FeatureVectorPtr GetEqual(FeatureVectorPtr example)
KKStr SubStrPart(kkint32 firstChar) const
Returns a substring consisting of all characters starting at index 'firstChar' until the end of the s...
Definition: KKStr.cpp:2780
const FeatureVectorListPtr DuplicatedImages()
const KKStr & ExampleFileName() const
Name of file that this FeatureVector was computed from.
KKStr osGetRootName(const KKStr &fullFileName)