KSquare Utilities
XmlTokenizer.cpp
Go to the documentation of this file.
1 /* XmlTokenizer.cpp -- Class to Manage Token Parsing
2  * Copyright (C) 1994-2014 Kurt Kramer
3  * For conditions of distribution and use, see copyright notice in KKB.h
4  */
5 #include "FirstIncludes.h"
6 #include <iostream>
7 #include <fstream>
8 #include <map>
9 #include <vector>
10 #include <string.h>
11 #include "MemoryDebug.h"
12 using namespace std;
13 
14 
15 #include "XmlTokenizer.h"
16 #include "KKStr.h"
17 #include "TokenBuffer.h"
18 using namespace KKB;
19 
20 //#define _LogStream_
21 
22 
24 
// NOTE(review): the constructor signature line (per the generated index:
// 'XmlTokenizer (TokenBufferPtr _in)', followed by ':') appears to have been
// lost in extraction; the member-initializer list below starts here.
  atEndOfFile (false),
  in (_in),               // caller-supplied token buffer
  tokenList (),
  weOwnTokenBuffer (false)  // buffer belongs to the caller; destructor must not delete it

#if defined(_LogStream_)
  ,
  logger1 ("C:\\Temp\\XmlTokenizer-1.txt"),
  logger2 ("C:\\Temp\\XmlTokenizer-2.txt")
#endif
{
  Initialize ();  // registers entities, primes firstChar, pre-reads tokens
}
38 
39 
40 
42 
// NOTE(review): the constructor signature line (per the generated index:
// 'XmlTokenizer (const KKStr& _str)', followed by ':') appears to have been
// lost in extraction; the member-initializer list below starts here.
  atEndOfFile (false),
  in (NULL),
  tokenList (),
  weOwnTokenBuffer (false)
#if defined(_LogStream_)
  ,
  logger1 ("C:\\Temp\\XmlTokenizer-1.txt"),
  logger2 ("C:\\Temp\\XmlTokenizer-2.txt")
#endif
{
  // Wrap the supplied string in a TokenBufferStr; since we created it we own
  // it and the destructor will delete it.
  in = new TokenBufferStr (_str);
  weOwnTokenBuffer = true;
  Initialize ();
}
57 
58 
59 
60 XmlTokenizer::XmlTokenizer (const KKStr& _fileName,
61  bool& _fileOpened
62  ):
63 
64  atEndOfFile (false),
65  in (NULL),
66  tokenList (),
67  weOwnTokenBuffer (false)
68 #if defined(_LogStream_)
69  ,
70  logger1 ("C:\\Temp\\XmlTokenizer-1.txt"),
71  logger2 ("C:\\Temp\\XmlTokenizer-2.txt")
72 #endif
73 {
74  in = new TokenBufferStream (_fileName);
75  _fileOpened = (in->Valid ());
76  if (_fileOpened)
77  {
78  weOwnTokenBuffer = true;
79  Initialize ();
80  }
81 }
82 
83 
84 
85 
// NOTE(review): the destructor signature line ('XmlTokenizer::~XmlTokenizer ()')
// appears to have been lost in extraction.
{
  // Release the token buffer only when this instance created it (string/file
  // constructors); a caller-supplied buffer remains the caller's to delete.
  // NOTE(review): any KKStr tokens still queued in 'tokenList' are not deleted
  // here — confirm whether pre-read tokens can leak at destruction.
  if (weOwnTokenBuffer)
  {
    delete in;
    in = NULL;
  }
}
94 
95 
96 
97 
98 void XmlTokenizer::Initialize ()
99 {
100  entityMap.insert (pair<KKStr,char> ("quot",'"'));
101  entityMap.insert (pair<KKStr,char> ("amp", '&'));
102  entityMap.insert (pair<KKStr,char> ("apos",'\''));
103  entityMap.insert (pair<KKStr,char> ("lt", '<'));
104  entityMap.insert (pair<KKStr,char> ("gt", '>'));
105  entityMap.insert (pair<KKStr,char> ("tab", '\t'));
106  entityMap.insert (pair<KKStr,char> ("lf", '\n'));
107  entityMap.insert (pair<KKStr,char> ("cr", '\r'));
108 
109  GetNextChar ();
110 
111  tokenListLen = 10;
112 
113  while ((tokenList.size () < tokenListLen) && (!atEndOfFile))
114  {
115  ReadInNextLogicalToken ();
116  }
117 } /* Initialize */
118 
119 
120 
121 
122 char XmlTokenizer::LookUpEntity (const KKStr& entityName) const
123 {
124  map<KKStr,char>::const_iterator idx;
125  idx = entityMap.find (entityName);
126  if (idx == entityMap.end ())
127  return 0;
128  else
129  return idx->second;
130 }
131 
132 
133 
134 
// NOTE(review): the function signature line (per the generated index:
// 'KKStrPtr XmlTokenizer::GetNextToken ()') appears lost in extraction.
{
  // Make sure at least one token is buffered, unless the stream is exhausted.
  while ((tokenList.size () < 1) && (!atEndOfFile))
    ReadInNextLogicalToken ();

  // NULL signals end of input to the caller.
  if (tokenList.size () < 1)
  {
    #if defined(_LogStream_)
    logger2 << "GetNextToken return NULL" << endl;
    logger2.flush ();
    #endif
    return NULL;
  }

  kkuint32 s = tokenList.size ();  // only consumed by the _LogStream_ diagnostics below

  // Pop the front token; ownership of the KKStr passes to the caller.
  KKStrPtr t = tokenList.front ();
  tokenList.pop_front ();

  #if defined(_LogStream_)
  logger2 << "GetNextToken size[" << s << "] :" << (t ? (*t) : "NULL") << endl;
  logger2.flush ();
  #endif

  return t;
} /* GetNextToken */
161 
162 
163 
/**
 @brief Will return a list of tokens up to and including the first occurrence of 'delToken'.
 */
167 KKStrListPtr XmlTokenizer::GetNextTokens (const KKStr& delToken)
168 {
169  if (delToken.Empty ())
170  return NULL;
171 
172  if (atEndOfFile && (tokenList.size () < 1))
173  return NULL;
174 
175  KKStrPtr t = GetNextToken ();
176  if (t == NULL)
177  return NULL;
178 
179  KKStrListPtr tokens = new KKStrList (true);
180  while ((t != NULL) && (*t != delToken))
181  {
182  tokens->PushOnBack (t);
183  t = GetNextToken ();
184  }
185 
186  if (t)
187  tokens->PushOnBack (t);
188 
189  return tokens;
190 } /* GetNextTokens */
191 
192 
193 
// Pushes 't' back onto the front of the pending-token queue so it becomes the
// next token returned by GetNextToken().  NOTE(review): the queue appears to
// (re)assume ownership of 't' — confirm against how GetNextToken's callers
// are expected to manage token lifetime.
void XmlTokenizer::PushTokenOnFront (KKStrPtr t)
{
  tokenList.push_front (t);
}
198 
199 
200 
201 
202 KKStrConstPtr XmlTokenizer::Peek (kkuint32 idx)
203 {
204  while ((tokenList.size () < (idx + 1)) && !atEndOfFile)
205  ReadInNextLogicalToken ();
206 
207  if (idx >= tokenList.size ())
208  {
209  #if defined(_LogStream_)
210  logger2 << "Peek idx[" << idx << "] returning NULL" << endl;
211  #endif
212  return NULL;
213  }
214 
215  #if defined(_LogStream_)
216  logger2 << "Peek idx[" << idx << "] :" << *(tokenList[idx]) << endl;
217  #endif
218 
219  return tokenList[idx];
220 } /* Peek */
221 
222 
223 
// NOTE(review): the function signature line (likely
// 'bool XmlTokenizer::EndOfFile ()') appears lost in extraction.
{
// if (tokenList.QueueSize () == 0)
// return true;
  // True only when the buffered queue is empty AND the underlying stream is
  // exhausted; tries to buffer one more token before deciding.
  while ((tokenList.size () < 1) && (!atEndOfFile))
    ReadInNextLogicalToken ();

  return (tokenList.size () < 1);
} /* EndOfFile */
233 
234 
235 
/**
 * @brief Advances the input one character, leaving the result in 'firstChar'
 * and returning it.
 * Returns 0 (and latches 'atEndOfFile') once the stream is exhausted.  A
 * "\r\n" pair is collapsed into a single character: the '\n' replaces the
 * '\r'.  A lone '\r' is returned as-is.
 */
char XmlTokenizer::GetNextChar ()
{
  if (atEndOfFile)
  {
    firstChar = 0;  // EOF sentinel; safe to call repeatedly.
  }
  else if (in->EndOfFile ())
  {
    atEndOfFile = true;
    firstChar = 0;
    #if defined(_LogStream_)
    logger1 << endl << "GetNextChar atEndOfFile = true;" << endl;
    #endif
  }
  else
  {
    firstChar = in->GetNextChar ();
    if (in->EndOfFile ())
    {
      // NOTE(review): if TokenBuffer::EndOfFile() becomes true upon consuming
      // the last character, that character is discarded here (firstChar is
      // zeroed).  Verify TokenBuffer's EndOfFile semantics — this may drop
      // the final character of the stream.
      atEndOfFile = true;
      firstChar = 0;
      #if defined(_LogStream_)
      logger1 << endl << "GetNextChar atEndOfFile = true;" << endl;
      #endif
    }
    else
    {
      #if defined(_LogStream_)
      logger1 << firstChar;
      #endif
      if (firstChar == '\r')
      {
        // Collapse "\r\n" to a single '\n'.
        if (in->PeekNextChar () == '\n')
          firstChar = in->GetNextChar ();
      }
    }
  }

  #if defined(_LogStream_)
  logger1.flush ();
  #endif

  return firstChar;
} /* GetNextChar */
280 
281 
282 
283 
284 
285 void XmlTokenizer::ReadInNextLogicalToken ()
286 {
287  KKStrPtr t = GetNextTokenRaw ();
288 
289  if (t == NULL)
290  {
291  //tokenList.PushOnBack (new Token (tokEndOfFile, "EndOfFile"));
292  }
293  else
294  {
295  tokenList.push_back (t);
296  }
297 
298  #if defined(_LogStream_)
299  logger2 << "ReadInNextLogicalToken size[" << tokenList.size () << "] :" << (t ? (*t) : "NULL RETURNED") << endl;
300  logger2.flush ();
301  #endif
302 } /* ReadInNextLogicalToken */
303 
304 
305 
306 bool XmlTokenizer::WhiteSpaceChar (char c) const
307 {
308  if (strchr (" ", c) == NULL)
309  return false;
310  else
311  return true;
312 } /* WhiteSpaceChar */
313 
314 
315 
316 
317 KKStrPtr XmlTokenizer::GetNextTokenRaw ()
318 {
319  if (atEndOfFile)
320  return NULL;
321 
322  // Lets skip whitespace
323  while (WhiteSpaceChar (firstChar) && (!atEndOfFile))
324  GetNextChar ();
325  if (atEndOfFile)
326  {
327  return NULL;
328  }
329 
330  KKStrPtr nextRawToken = NULL;
331 
332  if (firstChar == '<')
333  {
334  // We are start of tag token
335  nextRawToken = ProcessTagToken ();
336  }
337 
338  else
339  {
340  nextRawToken = ProcessBodyToken ();
341  }
342 
343 
344  return nextRawToken;
345 } /* GetNextTokenRaw */
346 
347 
348 
349 
/**
 * @brief Reads a complete tag token: everything from the current '<' through
 * the matching '>', inclusive.
 * Quoted sections ("..." or '...') inside the tag are copied with their
 * quotes, honoring backslash escapes (\t \n \r \0 \\ \"), so a '>' inside
 * quotes does not terminate the tag.  A CR, LF, or CR-LF immediately after
 * the closing '>' is skipped.
 * @return Newly allocated KKStr holding the tag text; caller owns it.
 */
KKStrPtr XmlTokenizer::ProcessTagToken ()
{
  KKStrPtr token = new KKStr(100);
  token->Append (firstChar);  // the leading '<'
  GetNextChar ();

  while ((!atEndOfFile) && (firstChar != '>'))
  {
    if ((firstChar == '"') || (firstChar == '\''))
    {
      // We are starting a quote; will scan characters literally until we reach end of quote */
      char endingQuoteChar = firstChar;
      token->Append (firstChar);
      GetNextChar ();

      while ((!atEndOfFile) && (firstChar != endingQuoteChar))
      {
        if (firstChar == '\\')
        {
          // Translate the escape sequence.  An unrecognized escape keeps the
          // character following the backslash; the backslash itself is dropped.
          GetNextChar ();
          switch (firstChar)
          {
          case 't':  firstChar = '\t'; break;
          case 'n':  firstChar = '\n'; break;
          case 'r':  firstChar = '\r'; break;
          case '0':  firstChar = '\0'; break;
          case '\\': firstChar = '\\'; break;
          case '"':  firstChar = '"';  break;
          }
        }
        token->Append (firstChar);
        GetNextChar ();
      }

      if (firstChar == endingQuoteChar)
      {
        token->Append (firstChar);  // closing quote
        GetNextChar ();
      }
    }
    else
    {
      token->Append (firstChar);
      GetNextChar ();
    }
  }

  if (!atEndOfFile)
  {
    token->Append (firstChar);  // the terminating '>'
    GetNextChar ();

    // If there is a trailing carriage-return line-feed or just line-feed; we want to skip past them.
    if (firstChar == '\r') GetNextChar ();
    if (firstChar == '\n') GetNextChar ();
  }

  return token;
} /* ProcessTagToken */
409 
410 
411 
412 
/**
 *@brief Processes an XML entity such as "&lt;"; when you encounter an ampersand (&) in the stream you
 * call this method; it will scan until it reaches the matching semicolon (';') character. The word
 * located between the '&' and ';' will be used to look up the appropriate replacement character
 * in 'entityMap'.
 */
void XmlTokenizer::ProcessAmpersand ()
{
  KKStr entityName (10);  // entity names are short; scanning is capped at 10 chars
  if (in->EndOfFile ())
  {
    atEndOfFile = true;
    return;
  }

  // Collect the name between '&' and the closing ';'.
  char ch = in->GetNextChar ();
  while ((!in->EndOfFile ()) && (ch != ';') && (entityName.Len () < 10))
  {
    entityName.Append (ch);
    ch = in->GetNextChar ();
  }

  if (ch != ';')
  {
    // Name is getting too long; the ampersand is invalid; will return characters as is.
    while (entityName.Len () > 0)
    {
      char ch = entityName.ExtractLastChar ();
      // NOTE(review): a statement appears to be missing here in this
      // extraction — presumably in->UnGetNextChar () (declared in
      // TokenBuffer) pushing 'ch' back onto the stream; as shown, 'ch' is
      // extracted and discarded.  Verify against the original source.
    }
  }
  else
  {
    // Replace 'firstChar' with the mapped character (0 when the name is
    // not a registered entity).
    char ch = LookUpEntity (entityName);
    firstChar = ch;
  }
} /* ProcessAmpersand */
450 
451 
452 
453 
454 KKStrPtr XmlTokenizer::ProcessBodyToken ()
455 {
456  KKStrPtr token = new KKStr(512);
457 
458  while ((!atEndOfFile) && (firstChar != '<') && (firstChar != '\n'))
459  {
460  if (firstChar == '&')
461  ProcessAmpersand ();
462  token->Append (firstChar);
463  GetNextChar ();
464  }
465 
466  //token->TrimRight (" \r\n");
467 
468  if ((firstChar == '\n') && (!atEndOfFile))
469  GetNextChar ();
470 
471  // At this point we are either at end-of-file, end-of-line, or the next character is "<" start of a tag field.
472  return token;
473 } /* ProcessTagToken */
474 
475 
476 
477 
478 
479 
480 
// Convenience alias for Peek(): tokenizer[idx] looks ahead 'idx' tokens
// (0 = next token) without consuming anything from the stream.
KKStrConstPtr XmlTokenizer::operator[](kkuint32 idx)
{
  return Peek (idx);
} /* operator[] */
void PushTokenOnFront(KKStrPtr t)
places token at current position such that it will be the next token extracted from the stream...
KKStr(kkint32 size)
Creates a KKStr object that pre-allocates space for 'size' characters.
Definition: KKStr.cpp:655
TokenBufferStr(const KKStr &_buff)
Definition: TokenBuffer.cpp:34
virtual void UnGetNextChar()=0
virtual bool Valid()=0
Manages the break down a stream into a set of logical tokens compatible with the XML format...
Definition: XmlTokenizer.h:25
TokenBufferStream(const KKStr &_fileName)
virtual char PeekNextChar()=0
char ExtractLastChar()
Removes the last character from the string and returns it to the caller.
Definition: KKStr.cpp:3226
XmlTokenizer(const KKStr &_fileName, bool &_fileOpened)
unsigned __int32 kkuint32
Definition: KKBaseTypes.h:89
virtual bool EndOfFile()=0
XmlTokenizer(TokenBufferPtr _in)
Constructs a XmlTokenizer using the provided [[TokenBuffer]] _in as the data stream source...
kkuint32 Len() const
Returns the number of characters in the string.
Definition: KKStr.h:366
KKTHread * KKTHreadPtr
void Append(char ch)
Definition: KKStr.cpp:1863
bool Empty() const
Definition: KKStr.h:241
static KKStr Concat(const std::vector< std::string > &values)
Concatenates the list of 'std::string' strings.
Definition: KKStr.cpp:1082
XmlTokenizer(const KKStr &_str)
Manages the extraction of xml tokens from a KKStr instance; accomplishes this by building a [[TokenBu...
KKStrConstPtr Peek(kkuint32 idx)
Allows you to look at future tokens in the stream; index of 0 would be the next token to be extracted...
bool operator!=(const KKStr &right) const
Definition: KKStr.cpp:1558
TokenBuffer * TokenBufferPtr
Definition: TokenBuffer.h:31
KKStrList(bool owner)
Definition: KKStr.cpp:4485
KKStrConstPtr operator[](kkuint32 idx)
KKStrListPtr GetNextTokens(const KKStr &delToken)
Returns a list of tokens up to and including the first occurrence of 'delToken'.
virtual char GetNextChar()=0
KKStrPtr GetNextToken()
Will retrieve the next token in the stream which will be either a tag token or up to one line of the ...