/AvalonEdit/ICSharpCode.AvalonEdit/Document/TextUtilities.cs

http://github.com/icsharpcode/ILSpy · C# · 422 lines · 266 code · 20 blank · 136 comment · 109 complexity · cd9dfe6f1c63a99181d2a94498ba3b89 MD5 · raw file

  1. // Copyright (c) 2014 AlphaSierraPapa for the SharpDevelop Team
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy of this
  4. // software and associated documentation files (the "Software"), to deal in the Software
  5. // without restriction, including without limitation the rights to use, copy, modify, merge,
  6. // publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
  7. // to whom the Software is furnished to do so, subject to the following conditions:
  8. //
  9. // The above copyright notice and this permission notice shall be included in all copies or
  10. // substantial portions of the Software.
  11. //
  12. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
  13. // INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  14. // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
  15. // FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  16. // OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  17. // DEALINGS IN THE SOFTWARE.
  18. using System;
  19. using System.Globalization;
  20. using System.Windows.Documents;
  21. #if NREFACTORY
  22. using ICSharpCode.NRefactory.Editor;
  23. #endif
  24. namespace ICSharpCode.AvalonEdit.Document
  25. {
  /// <summary>
  /// Selects which positions count as caret stops when searching for the next caret position.
  /// </summary>
  public enum CaretPositioningMode
  {
      /// <summary>
      /// Regular caret movement: every grapheme boundary is a stop.
      /// </summary>
      Normal,

      /// <summary>
      /// Only word borders are stops.
      /// </summary>
      WordBorder,

      /// <summary>
      /// Only the beginnings of words are stops (the Ctrl+Left/Ctrl+Right behavior).
      /// </summary>
      WordStart,

      /// <summary>
      /// Like <see cref="WordStart"/>, but additionally stops anywhere inside a run of symbols.
      /// </summary>
      WordStartOrSymbol,

      /// <summary>
      /// Like <see cref="WordBorder"/>, but additionally stops anywhere inside a run of symbols.
      /// </summary>
      WordBorderOrSymbol,

      /// <summary>
      /// Every boundary between Unicode codepoints is a stop, even inside a single grapheme.
      /// Used to delete the previous grapheme piecewise when Backspace is pressed.
      /// </summary>
      EveryCodepoint
  }
  57. /// <summary>
  58. /// Static helper methods for working with text.
  59. /// </summary>
  60. public static partial class TextUtilities
  61. {
  62. #region GetControlCharacterName
  // Short names of the Unicode C0 control characters (codepoints 0 to 31),
  // indexed directly by codepoint.
  static readonly string[] c0Table = {
      "NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL",
      "BS", "HT", "LF", "VT", "FF", "CR", "SO", "SI",
      "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", "SYN", "ETB",
      "CAN", "EM", "SUB", "ESC", "FS", "GS", "RS", "US"
  };

  // Short names for DEL (codepoint 127) followed by the C1 control characters
  // (codepoints 128 to 159); indexed by (codepoint - 127).
  static readonly string[] delAndC1Table = {
      "DEL",
      "PAD", "HOP", "BPH", "NBH", "IND", "NEL", "SSA", "ESA",
      "HTS", "HTJ", "VTS", "PLD", "PLU", "RI", "SS2", "SS3",
      "DCS", "PU1", "PU2", "STS", "CCH", "MW", "SPA", "EPA",
      "SOS", "SGCI", "SCI", "CSI", "ST", "OSC", "PM", "APC"
  };

  /// <summary>
  /// Returns the conventional short name of a control character (for example "NUL" or "DEL").
  /// Characters outside the C0 block, DEL and the C1 block yield their codepoint formatted
  /// as a lowercase hexadecimal number with at least four digits.
  /// </summary>
  /// <param name="controlCharacter">The character to look up.</param>
  /// <returns>The control character name, or the hexadecimal codepoint for other characters.</returns>
  public static string GetControlCharacterName(char controlCharacter)
  {
      int codepoint = controlCharacter;

      // DEL and the C1 block share one table that starts at codepoint 127.
      if (codepoint >= 127 && codepoint <= 159)
          return delAndC1Table[codepoint - 127];

      // Anything else that is not in the C0 block has no name here.
      if (codepoint >= c0Table.Length)
          return codepoint.ToString("x4", CultureInfo.InvariantCulture);

      return c0Table[codepoint];
  }
  93. #endregion
  94. #region GetWhitespace
  /// <summary>
  /// Finds the run of spaces and tabs that begins at <paramref name="offset"/>.
  /// Newline characters are not treated as whitespace and end the run.
  /// </summary>
  /// <param name="textSource">The text source to scan.</param>
  /// <param name="offset">The offset at which the whitespace run starts.</param>
  /// <returns>A segment starting at <paramref name="offset"/> that covers the whitespace;
  /// its length is 0 if the character at <paramref name="offset"/> is not a space or tab.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="textSource"/> is null.</exception>
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
      Justification = "WPF uses 'Whitespace'")]
  public static ISegment GetWhitespaceAfter(ITextSource textSource, int offset)
  {
      if (textSource == null)
          throw new ArgumentNullException("textSource");

      // Advance 'end' until the text runs out or a non-blank character is found.
      int end = offset;
      while (end < textSource.TextLength) {
          char current = textSource.GetCharAt(end);
          bool isBlank = current == ' ' || current == '\t';
          if (!isBlank)
              break;
          end++;
      }
      return new SimpleSegment(offset, end - offset);
  }
  /// <summary>
  /// Finds the run of spaces and tabs that ends directly before <paramref name="offset"/>.
  /// Newline characters are not treated as whitespace and end the run.
  /// </summary>
  /// <param name="textSource">The text source to scan.</param>
  /// <param name="offset">The offset at which the whitespace run ends.</param>
  /// <returns>A segment ending at <paramref name="offset"/> that covers the whitespace;
  /// its length is 0 if the character before <paramref name="offset"/> is not a space or tab.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="textSource"/> is null.</exception>
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
      Justification = "WPF uses 'Whitespace'")]
  public static ISegment GetWhitespaceBefore(ITextSource textSource, int offset)
  {
      if (textSource == null)
          throw new ArgumentNullException("textSource");

      // 'start' is the first offset known to belong to the whitespace run;
      // move it backwards while the preceding character is a space or tab.
      int start = offset;
      while (true) {
          int previous = start - 1;
          if (previous < 0)
              break;
          char current = textSource.GetCharAt(previous);
          if (current != ' ' && current != '\t')
              break;
          start = previous;
      }
      return new SimpleSegment(start, offset - start);
  }
  /// <summary>
  /// Gets the leading whitespace segment on the document line.
  /// </summary>
  /// <param name="document">The document that contains <paramref name="documentLine"/>.</param>
  /// <param name="documentLine">The line whose leading whitespace is requested.</param>
  /// <returns>The segment of spaces and tabs starting at the line's offset
  /// (as computed by <see cref="GetWhitespaceAfter"/>).</returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="documentLine"/> or <paramref name="document"/> is null.
  /// </exception>
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
      Justification = "WPF uses 'Whitespace'")]
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1011:ConsiderPassingBaseTypesAsParameters",
      Justification = "Parameter cannot be ITextSource because it must belong to the DocumentLine")]
  public static ISegment GetLeadingWhitespace(TextDocument document, DocumentLine documentLine)
  {
      if (documentLine == null)
          throw new ArgumentNullException("documentLine");
      // Validate here so the exception names this method's own parameter instead of
      // the callee's "textSource" parameter.
      if (document == null)
          throw new ArgumentNullException("document");
      return GetWhitespaceAfter(document, documentLine.Offset);
  }
  /// <summary>
  /// Gets the trailing whitespace segment on the document line.
  /// </summary>
  /// <param name="document">The document that contains <paramref name="documentLine"/>.</param>
  /// <param name="documentLine">The line whose trailing whitespace is requested.</param>
  /// <returns>The segment of spaces and tabs ending at the line's end offset.
  /// If that segment starts at the line's offset (the whole line is whitespace), an empty
  /// segment located at the line's end offset is returned instead.</returns>
  /// <exception cref="ArgumentNullException">
  /// <paramref name="documentLine"/> or <paramref name="document"/> is null.
  /// </exception>
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
      Justification = "WPF uses 'Whitespace'")]
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1011:ConsiderPassingBaseTypesAsParameters",
      Justification = "Parameter cannot be ITextSource because it must belong to the DocumentLine")]
  public static ISegment GetTrailingWhitespace(TextDocument document, DocumentLine documentLine)
  {
      if (documentLine == null)
          throw new ArgumentNullException("documentLine");
      // Validate here so the exception names this method's own parameter instead of
      // the callee's "textSource" parameter.
      if (document == null)
          throw new ArgumentNullException("document");

      ISegment segment = GetWhitespaceBefore(document, documentLine.EndOffset);
      // If the whole line consists of whitespace, we consider all of it as leading whitespace,
      // so return an empty segment as trailing whitespace.
      if (segment.Offset == documentLine.Offset)
          return new SimpleSegment(documentLine.EndOffset, 0);
      return segment;
  }
  168. #endregion
  169. #region GetSingleIndentationSegment
  /// <summary>
  /// Gets one unit of indentation starting at <paramref name="offset"/>: either a single tab,
  /// or up to <paramref name="indentationSize"/> consecutive spaces.
  /// </summary>
  /// <param name="textSource">The text source.</param>
  /// <param name="offset">The offset where the indentation segment starts.</param>
  /// <param name="indentationSize">The size of an indentation unit. See <see cref="TextEditorOptions.IndentationSize"/>.</param>
  /// <returns>The indentation segment. It is empty when the character at
  /// <paramref name="offset"/> is neither a tab nor a space.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="textSource"/> is null.</exception>
  public static ISegment GetSingleIndentationSegment(ITextSource textSource, int offset, int indentationSize)
  {
      if (textSource == null)
          throw new ArgumentNullException("textSource");

      // A tab directly at the start is a complete indentation unit on its own.
      if (offset < textSource.TextLength && textSource.GetCharAt(offset) == '\t')
          return new SimpleSegment(offset, 1);

      // Otherwise collect spaces until indentationSize is reached or a different
      // character (including a tab following spaces) shows up.
      int end = offset;
      while (end < textSource.TextLength
             && end - offset < indentationSize
             && textSource.GetCharAt(end) == ' ')
      {
          end++;
      }
      return new SimpleSegment(offset, end - offset);
  }
  203. #endregion
  204. #region GetCharacterClass
  /// <summary>
  /// Classifies a single UTF-16 code unit. '\r' and '\n' are line terminators, '_' counts as
  /// part of an identifier; every other character is classified by its Unicode category.
  /// </summary>
  /// <param name="c">The character to classify.</param>
  /// <returns>The character class of <paramref name="c"/>.</returns>
  [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "c")]
  public static CharacterClass GetCharacterClass(char c)
  {
      switch (c) {
          case '\r':
          case '\n':
              return CharacterClass.LineTerminator;
          case '_':
              return CharacterClass.IdentifierPart;
          default:
              return GetCharacterClass(char.GetUnicodeCategory(c));
      }
  }
  // Classifies a character outside the BMP given as its two UTF-16 surrogates.
  // Anything that is not a well-formed high/low surrogate pair is classified as Other.
  static CharacterClass GetCharacterClass(char highSurrogate, char lowSurrogate)
  {
      if (!char.IsSurrogatePair(highSurrogate, lowSurrogate))
          return CharacterClass.Other; // malformed surrogate pair

      // Look up the Unicode category of the pair via a two-character string.
      string pair = new string(new char[] { highSurrogate, lowSurrogate });
      UnicodeCategory category = char.GetUnicodeCategory(pair, 0);
      return GetCharacterClass(category);
  }
  // Maps a Unicode general category to the coarse character classes used for caret movement.
  // Categories that are not listed explicitly fall into CharacterClass.Other.
  static CharacterClass GetCharacterClass(UnicodeCategory c)
  {
      CharacterClass result;
      switch (c) {
          // letters and decimal digits form identifiers
          case UnicodeCategory.UppercaseLetter:
          case UnicodeCategory.LowercaseLetter:
          case UnicodeCategory.TitlecaseLetter:
          case UnicodeCategory.ModifierLetter:
          case UnicodeCategory.OtherLetter:
          case UnicodeCategory.DecimalDigitNumber:
              result = CharacterClass.IdentifierPart;
              break;

          // separators and control characters are treated as whitespace
          case UnicodeCategory.SpaceSeparator:
          case UnicodeCategory.LineSeparator:
          case UnicodeCategory.ParagraphSeparator:
          case UnicodeCategory.Control:
              result = CharacterClass.Whitespace;
              break;

          // marks that attach to the preceding character
          case UnicodeCategory.NonSpacingMark:
          case UnicodeCategory.SpacingCombiningMark:
          case UnicodeCategory.EnclosingMark:
              result = CharacterClass.CombiningMark;
              break;

          default:
              result = CharacterClass.Other;
              break;
      }
      return result;
  }
  249. #endregion
  250. #region GetNextCaretPosition
  /// <summary>
  /// Searches for the next caret stop in the given direction.
  /// </summary>
  /// <param name="textSource">The text source.</param>
  /// <param name="offset">The start offset inside the text source.</param>
  /// <param name="direction">The search direction (forwards or backwards).</param>
  /// <param name="mode">The mode for caret positioning.</param>
  /// <returns>The offset of the next caret position, or -1 if there is no further caret position
  /// in the text source.</returns>
  /// <exception cref="ArgumentNullException"><paramref name="textSource"/> is null.</exception>
  /// <exception cref="ArgumentException"><paramref name="mode"/> or <paramref name="direction"/>
  /// is not one of the supported values.</exception>
  /// <remarks>
  /// This method is NOT equivalent to the actual caret movement when using VisualLine.GetNextCaretPosition.
  /// In real caret movement, there are additional caret stops at line starts and ends. This method
  /// treats linefeeds as simple whitespace.
  /// </remarks>
  public static int GetNextCaretPosition(ITextSource textSource, int offset, LogicalDirection direction, CaretPositioningMode mode)
  {
      if (textSource == null)
          throw new ArgumentNullException("textSource");

      bool supportedMode =
          mode == CaretPositioningMode.Normal
          || mode == CaretPositioningMode.EveryCodepoint
          || mode == CaretPositioningMode.WordBorder
          || mode == CaretPositioningMode.WordBorderOrSymbol
          || mode == CaretPositioningMode.WordStart
          || mode == CaretPositioningMode.WordStartOrSymbol;
      if (!supportedMode)
          throw new ArgumentException("Unsupported CaretPositioningMode: " + mode, "mode");

      bool backward = direction == LogicalDirection.Backward;
      if (!backward && direction != LogicalDirection.Forward)
          throw new ArgumentException("Invalid LogicalDirection: " + direction, "direction");

      bool stopAtEveryBoundary = IsNormal(mode);
      bool wordStartsOnly = mode == CaretPositioningMode.WordStart
                            || mode == CaretPositioningMode.WordStartOrSymbol;
      int step = backward ? -1 : 1;
      int length = textSource.TextLength;

      if (length <= 0) {
          // An empty text source has a normal caret position at 0, but no word borders.
          // Position 0 is only "next" if the start offset lies on the far side of it.
          bool zeroIsAhead = backward ? offset > 0 : offset < 0;
          return (stopAtEveryBoundary && zeroIsAhead) ? 0 : -1;
      }

      // Walk candidate positions one at a time. Leaving the range [0, length] means there is
      // no further caret position; this also covers start offsets outside the valid range.
      for (int candidate = offset + step; candidate >= 0 && candidate <= length; candidate += step) {
          // The borders of the text source need special handling
          // (a text source usually isn't the whole document, but a single VisualLineElement).
          if (candidate == 0) {
              // At the start there is only a word border if the first character is not whitespace.
              if (stopAtEveryBoundary || !char.IsWhiteSpace(textSource.GetCharAt(0)))
                  return candidate;
              continue;
          }
          if (candidate == length) {
              // At the end there is never a word start, and only a word border
              // if the last character is not whitespace.
              if (!wordStartsOnly
                  && (stopAtEveryBoundary || !char.IsWhiteSpace(textSource.GetCharAt(length - 1))))
              {
                  return candidate;
              }
              continue;
          }

          char previousChar = textSource.GetCharAt(candidate - 1);
          char followingChar = textSource.GetCharAt(candidate);

          // Never stop in the middle of a surrogate pair.
          if (char.IsSurrogatePair(previousChar, followingChar))
              continue;

          // Characters outside the BMP are classified using both surrogates when the
          // other half lies inside the text source.
          CharacterClass previousClass = (char.IsLowSurrogate(previousChar) && candidate >= 2)
              ? GetCharacterClass(textSource.GetCharAt(candidate - 2), previousChar)
              : GetCharacterClass(previousChar);
          CharacterClass followingClass = (char.IsHighSurrogate(followingChar) && candidate + 1 < length)
              ? GetCharacterClass(followingChar, textSource.GetCharAt(candidate + 1))
              : GetCharacterClass(followingChar);

          if (StopBetweenCharacters(mode, previousClass, followingClass))
              return candidate;
      }
      return -1;
  }
  // True for the two modes that do not look for word borders:
  // Normal and EveryCodepoint.
  static bool IsNormal(CaretPositioningMode mode)
  {
      switch (mode) {
          case CaretPositioningMode.Normal:
          case CaretPositioningMode.EveryCodepoint:
              return true;
          default:
              return false;
      }
  }
  // Decides whether the position between two adjacent characters, given by their
  // character classes, is a caret stop in the requested mode.
  static bool StopBetweenCharacters(CaretPositioningMode mode, CharacterClass charBefore, CharacterClass charAfter)
  {
      // EveryCodepoint stops everywhere, even inside a grapheme.
      if (mode == CaretPositioningMode.EveryCodepoint)
          return true;

      // All other modes never separate a combining mark from the character before it.
      if (charAfter == CharacterClass.CombiningMark)
          return false;

      // Normal mode stops after every grapheme.
      if (mode == CaretPositioningMode.Normal)
          return true;

      bool symbolMode = mode == CaretPositioningMode.WordBorderOrSymbol
                        || mode == CaretPositioningMode.WordStartOrSymbol;
      bool wordStartMode = mode == CaretPositioningMode.WordStart
                           || mode == CaretPositioningMode.WordStartOrSymbol;

      if (charBefore == charAfter) {
          // Same class on both sides: no border, except that the "OrSymbol" modes
          // stop between any two characters of class Other.
          return symbolMode && charBefore == CharacterClass.Other;
      }

      // The classes differ, so this is a word border. When only word starts are wanted,
      // reject borders that are followed by whitespace or a line terminator (word ends).
      bool followedByGap = charAfter == CharacterClass.Whitespace
                           || charAfter == CharacterClass.LineTerminator;
      return !(wordStartMode && followedByGap);
  }
  371. #endregion
  372. }
  /// <summary>
  /// Coarse classification of a character: whitespace, line terminator, identifier part,
  /// combining mark, or anything else.
  /// </summary>
  public enum CharacterClass
  {
      /// <summary>
      /// None of the other classes applies to the character.
      /// </summary>
      Other,

      /// <summary>
      /// A whitespace character that is not a line terminator.
      /// </summary>
      [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
          Justification = "WPF uses 'Whitespace'")]
      Whitespace,

      /// <summary>
      /// A character that may appear in an identifier (letter, digit or underscore).
      /// </summary>
      IdentifierPart,

      /// <summary>
      /// A line terminator (\r or \n).
      /// </summary>
      LineTerminator,

      /// <summary>
      /// A Unicode combining mark, which modifies the character before it.
      /// Covers the Unicode designations "Mn", "Mc" and "Me".
      /// </summary>
      CombiningMark
  }
  402. }