PageRenderTime 43ms CodeModel.GetById 15ms RepoModel.GetById 0ms app.codeStats 0ms

/NRaasPacker/XmlTokenizer.cs

https://gitlab.com/forpdfsending/NRaas
C# | 351 lines | 283 code | 29 blank | 39 comment | 51 complexity | 86736a9c604732354f1c4822b485e11c MD5 | raw file
  1. ///---------------------------------------------------------------------------
  2. /// File Name: XmlTokenizer.cs
  3. /// Description: Class responsible for splitting Xml text into tokens.
  4. ///
  5. /// Author: Ali Badereddin
  6. /// Created: 26/12/2009
  7. ///---------------------------------------------------------------------------
  8. #region Using Directives
  9. using System;
  10. using System.Collections.Generic;
  11. using System.Text;
  12. using System.Text.RegularExpressions;
  13. #endregion
  14. /// <summary>
  15. /// Class responsible for splitting Xml text into tokens.
  16. /// </summary>
  17. public class XmlTokenizer
  18. {
  19. #region Public Static Methods
  20. /// <summary>
  21. /// Split the passed string into Xml tokens.
  22. /// </summary>
  23. /// <param name="str"></param>
  24. /// <returns></returns>
  25. /// <remarks>Looping char by char is more efficient than regular expressions</remarks>
  26. public static List<XmlToken> Tokenize(string str)
  27. {
  28. return LoopTokenize(str);
  29. }
  30. #endregion
  31. #region Helper Methods
  32. /// <summary>
  33. /// Iterate over characters one by one to tokenize the Xml string.
  34. /// </summary>
  35. /// <param name="str"></param>
  36. /// <returns></returns>
  37. private static List<XmlToken> LoopTokenize(string str)
  38. {
  39. // Temp variables to build up the current token
  40. List<char> currentTokenText = new List<char>();
  41. // Represents the list of tokens to be returned
  42. List<XmlToken> tokens = new List<XmlToken>();
  43. // Represents the index of the first character in the token
  44. int tokenIndex = 0;
  45. bool isStartTag = false;
  46. bool isComment = false;
  47. bool isQuote = false;
  48. bool isAttribute = false;
  49. for (int i = 0; i < str.Length; i++)
  50. {
  51. // Get the current character
  52. char c = str[i];
  53. // Skip the "ZERO WIDTH NO-BREAK SPACE" character resulting from encoding
  54. if (c == 65279)
  55. {
  56. continue;
  57. }
  58. // Handle the escape sequence case
  59. if (c == '&')
  60. {
  61. if (currentTokenText.Count > 0)
  62. {
  63. XmlToken token = new XmlToken();
  64. token.Index = tokenIndex;
  65. token.Text = new string(currentTokenText.ToArray());
  66. tokens.Add(token);
  67. currentTokenText.Clear();
  68. // Determine token type
  69. if (isQuote)
  70. {
  71. token.Type = XmlTokenType.Value;
  72. }
  73. else if (isComment)
  74. {
  75. token.Type = XmlTokenType.Comment;
  76. }
  77. else if (isStartTag)
  78. {
  79. if (isAttribute)
  80. {
  81. token.Type = XmlTokenType.Attribute;
  82. }
  83. else
  84. {
  85. token.Type = XmlTokenType.Element;
  86. }
  87. }
  88. else
  89. {
  90. token.Type = XmlTokenType.None;
  91. }
  92. }
  93. currentTokenText.Add('&');
  94. XmlToken escapeToken = new XmlToken();
  95. escapeToken.Type = XmlTokenType.Escape;
  96. escapeToken.Index = i;
  97. i++;
  98. while (i < str.Length && char.IsLetterOrDigit(str[i]))
  99. {
  100. currentTokenText.Add(str[i]);
  101. i++;
  102. }
  103. if (i < str.Length && c == ';')
  104. {
  105. currentTokenText.Add(';');
  106. i++;
  107. }
  108. escapeToken.Text = new string(currentTokenText.ToArray());
  109. currentTokenText.Clear();
  110. tokens.Add(escapeToken);
  111. continue;
  112. }
  113. // Only if the character is not between "" that is in a tag
  114. if(!isQuote)
  115. {
  116. // Only if the character is not in a comment
  117. if(!isComment)
  118. {
  119. // Only if we already have a start tag
  120. if(isStartTag)
  121. {
  122. if (char.IsLetterOrDigit(c))
  123. {
  124. // We're starting to build up a token, so save its index
  125. if (currentTokenText.Count == 0)
  126. {
  127. tokenIndex = i;
  128. }
  129. currentTokenText.Add(c);
  130. }
  131. else
  132. {
  133. // Add the previous token that could be an element or an attribute
  134. if (currentTokenText.Count > 0)
  135. {
  136. XmlToken token = new XmlToken();
  137. token.Text = new string(currentTokenText.ToArray());
  138. currentTokenText.Clear();
  139. token.Index = tokenIndex;
  140. if (isAttribute)
  141. {
  142. token.Type = XmlTokenType.Attribute;
  143. }
  144. else
  145. {
  146. token.Type = XmlTokenType.Element;
  147. }
  148. tokens.Add(token);
  149. }
  150. // Check if we have something like <!-- to flag that we have a comment
  151. if (c == '-')
  152. {
  153. if (i - 2 >= 0 && i + 1 < str.Length)
  154. {
  155. if (str[i - 2] == '<' && str[i - 1] == '!' && str[i + 1] == '-')
  156. {
  157. isStartTag = false;
  158. isComment = true;
  159. i += 1;
  160. }
  161. }
  162. }
  163. // Check if our start tag is now closed
  164. else if (c == '>')
  165. {
  166. isStartTag = false;
  167. isAttribute = false;
  168. }
  169. // We hit another start tag
  170. else if (c == '<')
  171. {
  172. isAttribute = false;
  173. }
  174. // We're starting a quote
  175. else if (c == '"')
  176. {
  177. isQuote = true;
  178. }
  179. // Check if we now have an attribute
  180. if (char.IsWhiteSpace(c))
  181. {
  182. isAttribute = true;
  183. }
  184. else
  185. {
  186. tokens.Add(new XmlToken(c.ToString(), i, XmlTokenType.SpecialChar));
  187. }
  188. }
  189. }
  190. // If we didn't have a start tag, check if we now have one
  191. else
  192. {
  193. if (c == '<')
  194. {
  195. if (currentTokenText.Count > 0)
  196. {
  197. XmlToken token = new XmlToken();
  198. token.Index = tokenIndex;
  199. token.Text = new string(currentTokenText.ToArray());
  200. token.Type = XmlTokenType.None;
  201. tokens.Add(token);
  202. currentTokenText.Clear();
  203. }
  204. isStartTag = true;
  205. tokens.Add(new XmlToken("<", i, XmlTokenType.SpecialChar));
  206. }
  207. else
  208. {
  209. if (currentTokenText.Count == 0)
  210. {
  211. tokenIndex = i;
  212. }
  213. currentTokenText.Add(c);
  214. }
  215. }
  216. }
  217. // In case we have a comment
  218. else
  219. {
  220. // We're starting to build up a token, so save its index
  221. if (currentTokenText.Count == 0)
  222. {
  223. tokenIndex = i;
  224. }
  225. currentTokenText.Add(c);
  226. // Check if we have something like --> to see if we're closing a comment
  227. // or if we're at the end
  228. if (i + 2 < str.Length)
  229. {
  230. if (c == '-')
  231. {
  232. if (str[i + 1] == '-' && str[i + 2] == '>')
  233. {
  234. isComment = false;
  235. i += 2;
  236. }
  237. }
  238. }
  239. else
  240. {
  241. isComment = false;
  242. }
  243. if (!isComment)
  244. {
  245. XmlToken token = new XmlToken();
  246. token.Type = XmlTokenType.Comment;
  247. token.Index = tokenIndex;
  248. token.Text = new string(currentTokenText.ToArray());
  249. tokens.Add(token);
  250. currentTokenText.Clear();
  251. }
  252. }
  253. }
  254. // In case we have a quote
  255. else
  256. {
  257. // We're starting to build up a token, so save its index
  258. if (currentTokenText.Count == 0)
  259. {
  260. tokenIndex = i;
  261. }
  262. // Check if we no longer have a quote
  263. if (c == '"')
  264. {
  265. isQuote = false;
  266. XmlToken token = new XmlToken();
  267. token.Type = XmlTokenType.Value;
  268. token.Index = tokenIndex;
  269. token.Text = new string(currentTokenText.ToArray());
  270. tokens.Add(token);
  271. currentTokenText.Clear();
  272. }
  273. else
  274. {
  275. currentTokenText.Add(c);
  276. }
  277. }
  278. }
  279. // Handle the last element
  280. if (currentTokenText.Count > 0)
  281. {
  282. XmlToken token = new XmlToken();
  283. token.Index = tokenIndex;
  284. token.Text = new string(currentTokenText.ToArray());
  285. tokens.Add(token);
  286. currentTokenText.Clear();
  287. // Determine token type
  288. if (isQuote)
  289. {
  290. token.Type = XmlTokenType.Value;
  291. }
  292. else if (isComment)
  293. {
  294. token.Type = XmlTokenType.Comment;
  295. }
  296. else if (isStartTag)
  297. {
  298. if (isAttribute)
  299. {
  300. token.Type = XmlTokenType.Attribute;
  301. }
  302. else
  303. {
  304. token.Type = XmlTokenType.Element;
  305. }
  306. }
  307. else
  308. {
  309. token.Type = XmlTokenType.None;
  310. }
  311. }
  312. return tokens;
  313. }
  314. #endregion
  315. }