/src/Document.php

https://github.com/henck/rtf-html-php · PHP · 336 lines · 219 code · 38 blank · 79 comment · 62 complexity · c6284a775f97a0bb510ba444d489eda7 MD5 · raw file

  1. <?php
  2. declare(strict_types=1);
  3. namespace RtfHtmlPhp;
  /**
   * Parses an RTF string into a tree of Group, ControlWord, ControlSymbol
   * and Text elements.
   *
   * Parsing happens in the constructor; afterwards the tree is reachable
   * through the public $root property (null when the input contained no group).
   */
  class Document
  {
      /** @var string RTF string being parsed */
      private $rtf;

      /** @var int Current position (byte offset) in the RTF string */
      private $pos;

      /** @var int Length of the RTF string in bytes */
      private $len;

      /** @var Group|null Root group; stays null until the first '{' is parsed */
      public $root = null;

      /** @var Group|null Group currently being filled; null when outside any group */
      private $group;

      /**
       * Character most recently read by GetChar(), or null when the last
       * read attempt ran past the end of the input.
       *
       * @var string|null
       */
      protected $char = null;

      /**
       * Stack of \uc values, one entry per open group. The top entry is the
       * number of fallback characters to skip after a \u control word.
       *
       * @var int[]
       */
      protected $uc = [];

      /**
       * @param string $rtf RTF source to parse
       *
       * @throws \Exception when the input is truncated or contains text or
       *                    controls outside of any group
       */
      public function __construct($rtf)
      {
          $this->Parse($rtf);
      }

      /**
       * Report a parse error: raise a PHP notice and then throw.
       *
       * @param string $message
       *
       * @throws \Exception always
       */
      private function fail($message)
      {
          trigger_error($message);
          throw new \Exception($message);
      }

      /**
       * Abort parsing when no group is currently open.
       *
       * @param string $what Kind of element being added ("text", "control word", ...)
       *
       * @throws \Exception when there is no current group
       */
      private function requireGroup($what)
      {
          if ($this->group === null) {
              $this->fail("Parse error: RTF {$what} outside of group.");
          }
      }

      /**
       * Read the next character of the RTF stream into $this->char and
       * advance the position.
       *
       * @throws \Exception when reading beyond the end of the input
       */
      protected function GetChar()
      {
          $this->char = null;
          if ($this->pos >= $this->len) {
              $this->fail("Parse error: Tried to read past end of input; RTF is probably truncated.");
          }
          $this->char = $this->rtf[$this->pos++];
      }

      /**
       * (Helper method)
       * Is the current character an ASCII letter (A-Z or a-z)?
       */
      protected function is_letter(): bool
      {
          if ($this->char === null) {
              return false;
          }
          $code = ord($this->char);
          return ($code >= 65 && $code <= 90) || ($code >= 97 && $code <= 122);
      }

      /**
       * (Helper method)
       * Is the current character an ASCII digit (0-9)?
       */
      protected function is_digit(): bool
      {
          if ($this->char === null) {
              return false;
          }
          $code = ord($this->char);
          return $code >= 48 && $code <= 57;
      }

      /**
       * (Helper method)
       * Is the current character an end-of-line (EOL) character?
       *
       * Side effect: when the current CR/LF is directly followed by another
       * CR/LF (Windows "\r\n" or Acorn "\n\r" style), that second character
       * is consumed as well so the pair counts as a single EOL.
       *
       * @return bool
       */
      protected function is_endofline()
      {
          if ($this->char !== "\r" && $this->char !== "\n") {
              return false;
          }
          // Only peek when there is a next character; an EOL may be the last
          // byte of the input.
          if ($this->pos < $this->len) {
              $next = $this->rtf[$this->pos];
              if ($next === "\n" || $next === "\r") {
                  $this->GetChar();
              }
          }
          return true;
      }

      /**
       * (Helper method)
       * Is the current character a delimiter that ends a control word
       * (a space or an EOL)? May consume a second EOL character, see
       * is_endofline().
       *
       * @return bool
       */
      protected function is_space_delimiter()
      {
          return $this->char === " " || $this->is_endofline();
      }

      /**
       * Open a new group ('{'): push it below the current group, or make it
       * the root when no group is open, and push a matching \uc entry.
       */
      protected function ParseStartGroup()
      {
          $group = new Group();

          if ($this->group !== null) {
              // Nested group: attach to its parent and inherit the parent's
              // current \uc value.
              $group->parent = $this->group;
              $this->group->children[] = $group;
              $this->uc[] = end($this->uc);
          } else {
              // No open group: this group becomes the root and the uc stack
              // starts over with the default value of 1.
              $this->root = $group;
              $this->uc = [1];
          }

          $this->group = $group;
      }

      /**
       * Close the current group ('}'): return to its parent and drop the
       * group's \uc entry. A '}' with no open group is ignored.
       */
      protected function ParseEndGroup()
      {
          if ($this->group === null) {
              // Unbalanced closing brace: there is nothing to close and no
              // uc entry to pop.
              return;
          }
          $this->group = $this->group->parent;
          array_pop($this->uc);
      }

      /**
       * Parse a control word (letters, optional signed numeric parameter,
       * optional delimiter) and append it to the current group.
       *
       * Also maintains the \uc stack and, for \u, skips the fallback
       * characters that follow the Unicode code point.
       *
       * @throws \Exception when no group is open or the input is truncated
       */
      protected function ParseControlWord()
      {
          $this->requireGroup('control word');

          // Read letters until a non-letter is reached.
          $word = "";
          $this->GetChar();
          while ($this->is_letter()) {
              $word .= $this->char;
              $this->GetChar();
          }

          // Read parameter (if any) consisting of digits.
          // Parameter may be negative, i.e., starting with a '-'
          $parameter = null;
          $negative = false;
          if ($this->char === '-') {
              $this->GetChar();
              $negative = true;
          }
          while ($this->is_digit()) {
              $parameter = ($parameter ?? 0) * 10 + (int) $this->char;
              $this->GetChar();
          }

          // If no parameter is present, fall back to 1.
          if ($parameter === null) {
              $parameter = 1;
          }
          if ($negative) {
              $parameter = -$parameter;
          }

          // \ucN replaces the uc value of the current group.
          if ($word === "uc") {
              array_pop($this->uc);
              $this->uc[] = $parameter;
          }

          // A space/EOL delimiter belongs to the control word and is
          // swallowed; any other character is pushed back.
          if (!$this->is_space_delimiter()) {
              $this->pos--;
          }

          // \uN is followed by {uc} fallback characters that must be skipped.
          if ($word === "u") {
              // Negative parameters encode code points above 32767.
              if ($negative) {
                  $parameter = 65536 + $parameter;
              }

              $remaining = end($this->uc);
              while ($remaining > 0) {
                  $this->GetChar();
                  if ($this->char === '\\'
                      && $this->pos < $this->len
                      && $this->rtf[$this->pos] === '\''
                  ) {
                      // Fallback given as \'hh: jump over the quote and
                      // both hex digits.
                      $this->pos += 3;
                  } elseif ($this->char === '{' || $this->char === '}') {
                      // A scope delimiter ends the skippable data. Push it
                      // back so the main loop still opens/closes the group.
                      $this->pos--;
                      break;
                  }
                  // Known limitation: any other control word or symbol
                  // should count as one skippable character, but only its
                  // leading backslash is skipped here.
                  $remaining--;
              }
          }

          // Add new RTF word as a child to the current group.
          $rtfword = new ControlWord();
          $rtfword->word = $word;
          $rtfword->parameter = $parameter;
          $this->group->children[] = $rtfword;
      }

      /**
       * Parse a control symbol (a single non-letter character) and append it
       * to the current group.
       *
       * An escaped EOL is stored as a \par control word with parameter 0;
       * \' is followed by two hex digits that become the symbol's parameter.
       *
       * @throws \Exception when no group is open or the input is truncated
       */
      protected function ParseControlSymbol()
      {
          $this->requireGroup('control symbol');

          // Read symbol (one character only).
          $this->GetChar();
          $symbol = $this->char;

          // Exceptional case: treat EOL symbols as \par control word.
          if ($this->is_endofline()) {
              $par = new ControlWord();
              $par->word = 'par';
              $par->parameter = 0;
              $this->group->children[] = $par;
              return;
          }

          // Symbols ordinarily have no parameter; \' carries a 2-digit hex code.
          $parameter = 0;
          if ($symbol === '\'') {
              $this->GetChar();
              $hex = $this->char;
              $this->GetChar();
              $hex .= $this->char;
              $parameter = hexdec($hex);
          }

          // Add new control symbol as a child to the current group:
          $rtfsymbol = new ControlSymbol();
          $rtfsymbol->symbol = $symbol;
          $rtfsymbol->parameter = $parameter;
          $this->group->children[] = $rtfsymbol;
      }

      /**
       * Dispatch after a backslash: a following letter starts a control
       * word, anything else is a control symbol.
       *
       * @throws \Exception when the input is truncated
       */
      protected function ParseControl()
      {
          // Look ahead by one character, then step back so the chosen
          // parser reads that character itself.
          $this->GetChar();
          $this->pos--;

          if ($this->is_letter()) {
              $this->ParseControlWord();
          } else {
              $this->ParseControlSymbol();
          }
      }

      /**
       * Parse plain text starting at the current character, up to the next
       * unescaped backslash or brace, and append it to the current group.
       *
       * Bare CR/LF characters are dropped; \\, \{ and \} contribute the
       * escaped character to the text.
       *
       * @throws \Exception when no group is open or the input is truncated
       */
      protected function ParseText()
      {
          $text = "";
          $terminate = false;

          do {
              // Ignore EOL characters
              if ($this->char === "\r" || $this->char === "\n") {
                  $this->GetChar();
                  continue;
              }

              if ($this->char === '\\') {
                  // Look ahead to see whether this is an escaped literal.
                  $this->GetChar();
                  if ($this->char !== '\\' && $this->char !== '{' && $this->char !== '}') {
                      // Not an escape: roll back to the backslash so the
                      // main loop parses it as a control.
                      $this->pos -= 2;
                      $terminate = true;
                  }
              } elseif ($this->char === '{' || $this->char === '}') {
                  // Unescaped brace: give it back to the main loop.
                  $this->pos--;
                  $terminate = true;
              }

              if (!$terminate) {
                  // Save plain text
                  $text .= $this->char;
                  $this->GetChar();
              }
          } while (!$terminate && $this->pos < $this->len);

          // If there is no current group, then this is not a valid RTF file.
          $this->requireGroup('text');

          // Add text as a child to the current group:
          $this->group->children[] = new Text($text);
      }

      /**
       * Attempt to parse an RTF string, rebuilding $root from scratch.
       *
       * @param string $rtf RTF source
       *
       * @throws \Exception when the input is truncated or contains text or
       *                    controls outside of any group
       */
      protected function Parse(string $rtf)
      {
          $this->rtf = $rtf;
          $this->pos = 0;
          $this->len = strlen($this->rtf);
          $this->group = null;
          $this->root = null;
          $this->char = null;
          $this->uc = [];

          // NOTE(review): the loop stops one character short of the end, so
          // the final byte of the input is never dispatched. Presumably this
          // tolerates a stray trailing byte after the closing brace - confirm
          // before changing the bound.
          while ($this->pos < $this->len - 1) {
              // Read next character:
              $this->GetChar();

              // Ignore \r and \n
              if ($this->char === "\n" || $this->char === "\r") {
                  continue;
              }

              // What type of character is this?
              if ($this->char === '{') {
                  $this->ParseStartGroup();
              } elseif ($this->char === '}') {
                  $this->ParseEndGroup();
              } elseif ($this->char === '\\') {
                  $this->ParseControl();
              } else {
                  $this->ParseText();
              }
          }
      }

      /**
       * String form of the parsed tree, or "No root group" when the input
       * contained no group.
       */
      public function __toString()
      {
          if (!$this->root) {
              return "No root group";
          }
          return $this->root->toString();
      }
  }