PageRenderTime 5636ms CodeModel.GetById 13ms RepoModel.GetById 1ms app.codeStats 0ms

/Candidate/UniscribeTest/DocxContentConverter/Program.cs

#
C# | 259 lines | 246 code | 13 blank | 0 comment | 21 complexity | 61920cf02c606d4017d3b2c32f4c2c71 MD5 | raw file
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Threading.Tasks;
  7. using System.Xml.Linq;
  8. namespace DocxContentConverter
  9. {
  10. class Program
  11. {
  /// <summary>
  /// Font attributes of a text run: family name, weight, color and size.
  /// </summary>
  class FontInfo
  {
      public string Name { get; set; }
      public bool Bold { get; set; }
      public string Color { get; set; }
      public string Size { get; set; }

      /// <summary>
      /// Renders the four properties as one "#"-separated string
      /// (Name#Bold#Color#Size). A null string property contributes nothing
      /// between its separators.
      /// </summary>
      public override string ToString()
      {
          return string.Format("{0}#{1}#{2}#{3}", Name, Bold, Color, Size);
      }
  }
  /// <summary>
  /// Lazily enumerates, in document order, the outermost elements named
  /// <paramref name="name"/> in the subtree rooted at <paramref name="parent"/>.
  /// A matching element is yielded itself and its own descendants are not searched.
  /// </summary>
  /// <param name="parent">Root of the subtree to search; it may itself be the match.</param>
  /// <param name="name">Qualified element name to look for.</param>
  /// <returns>The matching elements, produced on demand.</returns>
  static IEnumerable<XElement> RecursiveElements(XElement parent, XName name)
  {
      if (parent.Name != name)
      {
          // Each child is tested against the name when the recursion re-enters,
          // so no separate check is needed here.
          foreach (var child in parent.Elements())
          {
              foreach (var match in RecursiveElements(child, name))
              {
                  yield return match;
              }
          }
      }
      else
      {
          yield return parent;
      }
  }
  // XML namespace URIs used to build qualified element and attribute names
  // when querying the input document.

  // Formatting attributes (font-weight, color, font-size).
  private const string fo = "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";

  // Document skeleton (automatic-styles, body, text).
  private const string office = "urn:oasis:names:tc:opendocument:xmlns:office:1.0";

  // Style definitions (style, text-properties, name, font-name).
  private const string style = "urn:oasis:names:tc:opendocument:xmlns:style:1.0";

  // Table structure (table, table-row, table-cell, table-columns).
  private const string table = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";

  // Text content (p, span, s, tab, line-break, list, ...).
  private const string text = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
  /// <summary>
  /// Packs a piece of text together with the paragraph and text style names
  /// that are in effect for it.
  /// </summary>
  /// <param name="paragraphStyle">Style name of the enclosing paragraph; must not be null.</param>
  /// <param name="textStyle">Style name applied to the text itself; must not be null.</param>
  /// <param name="text">The text content; stored as given.</param>
  /// <returns>A tuple of (paragraphStyle, textStyle, text).</returns>
  /// <exception cref="ArgumentException">
  /// <paramref name="paragraphStyle"/> or <paramref name="textStyle"/> is null.
  /// </exception>
  static Tuple<string, string, string> GetTextItem(string paragraphStyle, string textStyle, string text)
  {
      // Report which style is missing so the failure can be traced to the input.
      if (paragraphStyle == null)
      {
          throw new ArgumentException("Text was produced while no paragraph style was in effect.", "paragraphStyle");
      }
      if (textStyle == null)
      {
          throw new ArgumentException("Text was produced while no text style was in effect.", "textStyle");
      }
      return Tuple.Create(paragraphStyle, textStyle, text);
  }
  /// <summary>
  /// Walks the child nodes of <paramref name="paragraphParent"/> in document order and
  /// lazily yields one (paragraph style, text style, text) tuple per piece of text found.
  /// </summary>
  /// <param name="paragraphParent">Element whose child nodes are scanned.</param>
  /// <param name="paragraphStyle">Paragraph style name inherited from the caller; replaced
  /// when a text:p carrying a style-name is entered.</param>
  /// <param name="textStyle">Text style name inherited from the caller; replaced when a
  /// text:p or text:span carrying a style-name is entered.</param>
  /// <param name="listIndex">Item counter handed down through text:list and text:list-item;
  /// it does not influence the yielded tuples.</param>
  /// <returns>The text items, produced on demand.</returns>
  /// <exception cref="ArgumentException">An unsupported element or node type is met, or text
  /// is found while no paragraph or text style is in effect.</exception>
  static IEnumerable<Tuple<string, string, string>> ExtractText(XElement paragraphParent, string paragraphStyle, string textStyle, int listIndex)
  {
      foreach (XNode node in paragraphParent.Nodes())
      {
          XElement element = node as XElement;
          if (element != null)
          {
              switch (element.Name.ToString())
              {
                  case "{" + text + "}p":
                      {
                          // The cast yields null when the attribute is absent; the
                          // inherited styles are kept in that case.
                          var styleName = (string)element.Attribute(XName.Get("style-name", text));
                          foreach (var p in ExtractText(element, styleName ?? paragraphStyle, styleName ?? textStyle, listIndex))
                          {
                              yield return p;
                          }
                      }
                      break;
                  case "{" + text + "}span":
                      {
                          var styleName = (string)element.Attribute(XName.Get("style-name", text));
                          foreach (var p in ExtractText(element, paragraphStyle, styleName ?? textStyle, listIndex))
                          {
                              yield return p;
                          }
                      }
                      break;
                  case "{" + text + "}line-break":
                      yield return GetTextItem(paragraphStyle, textStyle, "\r\n");
                      break;
                  case "{" + text + "}s":
                      {
                          // text:c holds the number of spaces this element stands for;
                          // a missing or unusable value counts as a single space.
                          int spaceCount;
                          if (!int.TryParse((string)element.Attribute(XName.Get("c", text)), out spaceCount) || spaceCount < 1)
                          {
                              spaceCount = 1;
                          }
                          yield return GetTextItem(paragraphStyle, textStyle, new string(' ', spaceCount));
                      }
                      break;
                  case "{" + text + "}tab":
                      yield return GetTextItem(paragraphStyle, textStyle, "\t");
                      break;
                  case "{" + text + "}a":
                  case "{" + table + "}table":
                  case "{" + table + "}table-row":
                  case "{" + table + "}table-cell":
                      // Pure containers: descend with the current styles unchanged.
                      foreach (var p in ExtractText(element, paragraphStyle, textStyle, listIndex))
                      {
                          yield return p;
                      }
                      break;
                  case "{" + text + "}list":
                      {
                          // Items of a list are numbered from 1.
                          listIndex = 0;
                          foreach (XElement item in element.Elements(XName.Get("list-item", text)))
                          {
                              foreach (var p in ExtractText(item, paragraphStyle, textStyle, ++listIndex))
                              {
                                  yield return p;
                              }
                          }
                      }
                      break;
                  case "{" + text + "}list-item":
                      listIndex += 1;
                      foreach (var p in ExtractText(element, paragraphStyle, textStyle, listIndex))
                      {
                          yield return p;
                      }
                      break;
                  case "{urn:oasis:names:tc:opendocument:xmlns:drawing:1.0}custom-shape":
                  case "{urn:oasis:names:tc:opendocument:xmlns:drawing:1.0}frame":
                  case "{" + text + "}soft-page-break":
                  case "{" + text + "}bookmark-start":
                  case "{" + text + "}bookmark-end":
                  case "{" + table + "}table-columns":
                      // Deliberately skipped: these produce no text items.
                      break;
                  default:
                      throw new ArgumentException("Unsupported element " + element.Name + " inside " + paragraphParent.Name + ".");
              }
          }
          else if (node is XText)
          {
              // Value is the literal text. ToString() would return serialized markup,
              // turning "&" into "&amp;" and wrapping CDATA sections in their delimiters.
              // NOTE(review): Trim() also removes a leading or trailing space that
              // separates this text from a neighbouring span - confirm this is intended.
              yield return GetTextItem(paragraphStyle, textStyle, ((XText)node).Value.Trim());
          }
          else
          {
              throw new ArgumentException("Unsupported node type " + node.NodeType + " inside " + paragraphParent.Name + ".");
          }
      }
  }
  /// <summary>
  /// Reads the XML document at <paramref name="from"/>, extracts its text runs together
  /// with their font information and writes them as lines to <paramref name="to"/>.
  /// Each run becomes "&lt;s&gt;font:bold:color:size:text&lt;/s&gt;"; each group of runs is
  /// followed by a "&lt;p/&gt;" line.
  /// </summary>
  /// <param name="from">Path of the input XML file.</param>
  /// <param name="to">Path of the text file to create or overwrite.</param>
  /// <exception cref="InvalidDataException">A required element or attribute is missing
  /// from the input document.</exception>
  /// <exception cref="KeyNotFoundException">Text refers to a style for which no font
  /// information was read.</exception>
  static void Convert(string from, string to)
  {
      XDocument document = XDocument.Load(from);
      Dictionary<string, FontInfo> styles = ReadFontStyles(document);

      XElement body = RequireElement(document.Root, XName.Get("body", office));
      XElement paragraphParent = RequireElement(body, XName.Get("text", office));

      // Everything is materialized before the output file is opened, so a failure while
      // reading the input never creates or truncates the output file.
      // NOTE(review): grouping is by paragraph style name, so all paragraphs that share
      // a style name end up in one group - confirm this is intended.
      List<List<Tuple<FontInfo, string>>> paragraphs = ExtractText(paragraphParent, null, null, 0)
          .GroupBy(item => item.Item1)
          .Select(group => MergeRuns(group, styles))
          .ToList();

      using (StreamWriter writer = new StreamWriter(to))
      {
          foreach (var paragraph in paragraphs)
          {
              foreach (var run in paragraph)
              {
                  FontInfo font = run.Item1;
                  writer.WriteLine(
                      "<s>{0}:{1}:{2}:{3}:{4}</s>",
                      font.Name,
                      // lower-case XML boolean spelling
                      font.Bold ? "true" : "false",
                      font.Color,
                      font.Size,
                      run.Item2
                      );
              }
              writer.WriteLine("<p/>");
          }
      }
  }

  // Builds the style-name -> FontInfo map from the automatic styles that have text-properties.
  static Dictionary<string, FontInfo> ReadFontStyles(XDocument document)
  {
      XElement automaticStyles = RequireElement(document.Root, XName.Get("automatic-styles", office));
      var fonts = new Dictionary<string, FontInfo>();
      foreach (XElement styleElement in automaticStyles.Elements(XName.Get("style", style)))
      {
          XElement properties = styleElement.Element(XName.Get("text-properties", style));
          if (properties == null)
          {
              // Styles without text-properties carry no font information.
              continue;
          }
          string styleName = RequireAttribute(styleElement, XName.Get("name", style), "a style element");
          string owner = "text-properties of style '" + styleName + "'";
          fonts.Add(styleName, new FontInfo
          {
              Name = RequireAttribute(properties, XName.Get("font-name", style), owner),
              Bold = (string)properties.Attribute(XName.Get("font-weight", fo)) == "bold",
              Color = (string)properties.Attribute(XName.Get("color", fo)) ?? "#000000",
              Size = RequireAttribute(properties, XName.Get("font-size", fo), owner),
          });
      }
      return fonts;
  }

  // Joins consecutive text items whose fonts render to the same FontInfo string into single runs.
  static List<Tuple<FontInfo, string>> MergeRuns(IEnumerable<Tuple<string, string, string>> items, IDictionary<string, FontInfo> styles)
  {
      var merged = new List<Tuple<FontInfo, string>>();
      var buffer = new StringBuilder();
      FontInfo currentFont = null;
      string currentKey = null;
      foreach (var item in items)
      {
          FontInfo font = LookupStyle(styles, item.Item2);
          string key = font.ToString();
          if (currentFont == null || key != currentKey)
          {
              // Font changed: close the run collected so far and start a new one.
              if (currentFont != null)
              {
                  merged.Add(Tuple.Create(currentFont, buffer.ToString()));
              }
              buffer.Clear();
              currentFont = font;
              currentKey = key;
          }
          buffer.Append(item.Item3);
      }
      if (currentFont != null)
      {
          merged.Add(Tuple.Create(currentFont, buffer.ToString()));
      }
      return merged;
  }

  // Returns the font information of a style, failing with the style name when it is unknown.
  static FontInfo LookupStyle(IDictionary<string, FontInfo> styles, string styleName)
  {
      FontInfo font;
      if (!styles.TryGetValue(styleName, out font))
      {
          throw new KeyNotFoundException("No font information was read for style '" + styleName + "'.");
      }
      return font;
  }

  // Returns the named child element, failing with a descriptive error when it is absent.
  static XElement RequireElement(XElement parent, XName name)
  {
      XElement child = parent.Element(name);
      if (child == null)
      {
          throw new InvalidDataException("Element " + parent.Name + " has no child element " + name + ".");
      }
      return child;
  }

  // Returns the value of the named attribute, failing with a descriptive error when it is absent.
  static string RequireAttribute(XElement element, XName name, string owner)
  {
      XAttribute attribute = element.Attribute(name);
      if (attribute == null)
      {
          throw new InvalidDataException("Attribute " + name + " is missing on " + owner + ".");
      }
      return attribute.Value;
  }
  /// <summary>
  /// Entry point: runs the conversion for two fixed input/output file pairs,
  /// given as relative paths. The command-line arguments are not used.
  /// </summary>
  static void Main(string[] args)
  {
      var jobs = new[]
      {
          Tuple.Create("content.xml", "document.txt"),
          Tuple.Create("content2.xml", "document2.txt"),
      };
      foreach (var job in jobs)
      {
          Convert(job.Item1, job.Item2);
      }
  }
  245. }
  246. }