/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php

http://github.com/facebook/phabricator · PHP · 275 lines · 220 code · 39 blank · 16 comment · 52 complexity · cd44e8956c74dbf091c3eda5e10caadf MD5 · raw file

  1. <?php
  2. final class PhutilProseDifferenceEngine extends Phobject {
  3. public function getDiff($u, $v) {
  4. return $this->buildDiff($u, $v, 0);
  5. }
  6. private function buildDiff($u, $v, $level) {
  7. $u_parts = $this->splitCorpus($u, $level);
  8. $v_parts = $this->splitCorpus($v, $level);
  9. if ($level === 0) {
  10. $diff = $this->newHashDiff($u_parts, $v_parts);
  11. $too_large = false;
  12. } else {
  13. list($diff, $too_large) = $this->newEditDistanceMatrixDiff(
  14. $u_parts,
  15. $v_parts,
  16. $level);
  17. }
  18. $diff->reorderParts();
  19. // If we just built a character-level diff, we're all done and do not
  20. // need to go any deeper.
  21. if ($level == 3) {
  22. return $diff;
  23. }
  24. $blocks = array();
  25. $block = null;
  26. foreach ($diff->getParts() as $part) {
  27. $type = $part['type'];
  28. $text = $part['text'];
  29. switch ($type) {
  30. case '=':
  31. if ($block) {
  32. $blocks[] = $block;
  33. $block = null;
  34. }
  35. $blocks[] = array(
  36. 'type' => $type,
  37. 'text' => $text,
  38. );
  39. break;
  40. case '-':
  41. if (!$block) {
  42. $block = array(
  43. 'type' => '!',
  44. 'old' => '',
  45. 'new' => '',
  46. );
  47. }
  48. $block['old'] .= $text;
  49. break;
  50. case '+':
  51. if (!$block) {
  52. $block = array(
  53. 'type' => '!',
  54. 'old' => '',
  55. 'new' => '',
  56. );
  57. }
  58. $block['new'] .= $text;
  59. break;
  60. }
  61. }
  62. if ($block) {
  63. $blocks[] = $block;
  64. }
  65. $result = new PhutilProseDiff();
  66. foreach ($blocks as $block) {
  67. $type = $block['type'];
  68. if ($type == '=') {
  69. $result->addPart('=', $block['text']);
  70. } else {
  71. $old = $block['old'];
  72. $new = $block['new'];
  73. if (!strlen($old) && !strlen($new)) {
  74. // Nothing to do.
  75. } else if (!strlen($old)) {
  76. $result->addPart('+', $new);
  77. } else if (!strlen($new)) {
  78. $result->addPart('-', $old);
  79. } else {
  80. if ($too_large) {
  81. // If this text was too big to diff, don't try to subdivide it.
  82. $result->addPart('-', $old);
  83. $result->addPart('+', $new);
  84. } else {
  85. $subdiff = $this->buildDiff(
  86. $old,
  87. $new,
  88. $level + 1);
  89. foreach ($subdiff->getParts() as $part) {
  90. $result->addPart($part['type'], $part['text']);
  91. }
  92. }
  93. }
  94. }
  95. }
  96. $result->reorderParts();
  97. return $result;
  98. }
  99. private function splitCorpus($corpus, $level) {
  100. switch ($level) {
  101. case 0:
  102. // Level 0: Split into paragraphs.
  103. $expr = '/([\n]+)/';
  104. break;
  105. case 1:
  106. // Level 1: Split into sentences.
  107. $expr = '/([\n,!;?\.]+)/';
  108. break;
  109. case 2:
  110. // Level 2: Split into words.
  111. $expr = '/(\s+)/';
  112. break;
  113. case 3:
  114. // Level 3: Split into characters.
  115. return phutil_utf8v_combined($corpus);
  116. }
  117. $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
  118. return $this->stitchPieces($pieces, $level);
  119. }
  120. private function stitchPieces(array $pieces, $level) {
  121. $results = array();
  122. $count = count($pieces);
  123. for ($ii = 0; $ii < $count; $ii += 2) {
  124. $result = $pieces[$ii];
  125. if ($ii + 1 < $count) {
  126. $result .= $pieces[$ii + 1];
  127. }
  128. if ($level < 2) {
  129. // Split pieces into separate text and whitespace sections: make one
  130. // piece out of all the whitespace at the beginning, one piece out of
  131. // all the actual text in the middle, and one piece out of all the
  132. // whitespace at the end.
  133. $matches = null;
  134. preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches);
  135. if (strlen($matches[1])) {
  136. $results[] = $matches[1];
  137. }
  138. if (strlen($matches[2])) {
  139. $results[] = $matches[2];
  140. }
  141. if (strlen($matches[3])) {
  142. $results[] = $matches[3];
  143. }
  144. } else {
  145. $results[] = $result;
  146. }
  147. }
  148. // If the input ended with a delimiter, we can get an empty final piece.
  149. // Just discard it.
  150. if (last($results) == '') {
  151. array_pop($results);
  152. }
  153. return $results;
  154. }
  155. private function newEditDistanceMatrixDiff(
  156. array $u_parts,
  157. array $v_parts,
  158. $level) {
  159. $matrix = id(new PhutilEditDistanceMatrix())
  160. ->setMaximumLength(128)
  161. ->setSequences($u_parts, $v_parts)
  162. ->setComputeString(true);
  163. // For word-level and character-level changes, smooth the output string
  164. // to reduce the choppiness of the diff.
  165. if ($level > 1) {
  166. $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);
  167. }
  168. $u_pos = 0;
  169. $v_pos = 0;
  170. $edits = $matrix->getEditString();
  171. $edits_length = strlen($edits);
  172. $diff = new PhutilProseDiff();
  173. for ($ii = 0; $ii < $edits_length; $ii++) {
  174. $c = $edits[$ii];
  175. if ($c == 's') {
  176. $diff->addPart('=', $u_parts[$u_pos]);
  177. $u_pos++;
  178. $v_pos++;
  179. } else if ($c == 'd') {
  180. $diff->addPart('-', $u_parts[$u_pos]);
  181. $u_pos++;
  182. } else if ($c == 'i') {
  183. $diff->addPart('+', $v_parts[$v_pos]);
  184. $v_pos++;
  185. } else if ($c == 'x') {
  186. $diff->addPart('-', $u_parts[$u_pos]);
  187. $diff->addPart('+', $v_parts[$v_pos]);
  188. $u_pos++;
  189. $v_pos++;
  190. } else {
  191. throw new Exception(
  192. pht(
  193. 'Unexpected character ("%s") in edit string.',
  194. $c));
  195. }
  196. }
  197. return array($diff, $matrix->didReachMaximumLength());
  198. }
  199. private function newHashDiff(array $u_parts, array $v_parts) {
  200. $u_ref = new PhabricatorDocumentRef();
  201. $v_ref = new PhabricatorDocumentRef();
  202. $u_blocks = $this->newDocumentEngineBlocks($u_parts);
  203. $v_blocks = $this->newDocumentEngineBlocks($v_parts);
  204. $rows = id(new PhabricatorDocumentEngineBlocks())
  205. ->addBlockList($u_ref, $u_blocks)
  206. ->addBlockList($v_ref, $v_blocks)
  207. ->newTwoUpLayout();
  208. $diff = new PhutilProseDiff();
  209. foreach ($rows as $row) {
  210. list($u_block, $v_block) = $row;
  211. if ($u_block && $v_block) {
  212. if ($u_block->getDifferenceType() === '-') {
  213. $diff->addPart('-', $u_block->getContent());
  214. $diff->addPart('+', $v_block->getContent());
  215. } else {
  216. $diff->addPart('=', $u_block->getContent());
  217. }
  218. } else if ($u_block) {
  219. $diff->addPart('-', $u_block->getContent());
  220. } else {
  221. $diff->addPart('+', $v_block->getContent());
  222. }
  223. }
  224. return $diff;
  225. }
  226. private function newDocumentEngineBlocks(array $parts) {
  227. $blocks = array();
  228. foreach ($parts as $part) {
  229. $hash = PhabricatorHash::digestForIndex($part);
  230. $blocks[] = id(new PhabricatorDocumentEngineBlock())
  231. ->setContent($part)
  232. ->setDifferenceHash($hash);
  233. }
  234. return $blocks;
  235. }
  236. }