PageRenderTime 44ms CodeModel.GetById 14ms app.highlight 25ms RepoModel.GetById 2ms app.codeStats 0ms

/AvalonEdit/ICSharpCode.AvalonEdit/Document/TextUtilities.cs

http://github.com/icsharpcode/ILSpy
C# | 422 lines | 266 code | 20 blank | 136 comment | 109 complexity | cd9dfe6f1c63a99181d2a94498ba3b89 MD5 | raw file
  1// Copyright (c) 2014 AlphaSierraPapa for the SharpDevelop Team
  2// 
  3// Permission is hereby granted, free of charge, to any person obtaining a copy of this
  4// software and associated documentation files (the "Software"), to deal in the Software
  5// without restriction, including without limitation the rights to use, copy, modify, merge,
  6// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
  7// to whom the Software is furnished to do so, subject to the following conditions:
  8// 
  9// The above copyright notice and this permission notice shall be included in all copies or
 10// substantial portions of the Software.
 11// 
 12// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
 13// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
 14// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
 15// FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 16// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 17// DEALINGS IN THE SOFTWARE.
 18
 19using System;
 20using System.Globalization;
 21using System.Windows.Documents;
 22#if NREFACTORY
 23using ICSharpCode.NRefactory.Editor;
 24#endif
 25
 26namespace ICSharpCode.AvalonEdit.Document
 27{
	/// <summary>
	/// Selects which offsets are accepted as caret stops when searching for the next caret position.
	/// </summary>
	public enum CaretPositioningMode
	{
		/// <summary>
		/// Regular caret movement: the caret stops after every grapheme.
		/// </summary>
		Normal = 0,

		/// <summary>
		/// The caret stops only on word borders.
		/// </summary>
		WordBorder = 1,

		/// <summary>
		/// The caret stops only where a word begins. This is used for Ctrl+Left/Ctrl+Right.
		/// </summary>
		WordStart = 2,

		/// <summary>
		/// The caret stops where a word begins, and additionally anywhere in the middle of symbols.
		/// </summary>
		WordStartOrSymbol = 3,

		/// <summary>
		/// The caret stops on word borders, and additionally anywhere in the middle of symbols.
		/// </summary>
		WordBorderOrSymbol = 4,

		/// <summary>
		/// The caret stops between every Unicode codepoint, even inside a single grapheme.
		/// This is used to implement deleting the previous grapheme when Backspace is pressed.
		/// </summary>
		EveryCodepoint = 5
	}
 59	
 60	/// <summary>
 61	/// Static helper methods for working with text.
 62	/// </summary>
 63	public static partial class TextUtilities
 64	{
 65		#region GetControlCharacterName
 66		// the names of the first 32 ASCII characters = Unicode C0 block
 67		static readonly string[] c0Table = {
 68			"NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "BEL", "BS", "HT",
 69			"LF", "VT", "FF", "CR", "SO", "SI", "DLE", "DC1", "DC2", "DC3",
 70			"DC4", "NAK", "SYN", "ETB", "CAN", "EM", "SUB", "ESC", "FS", "GS",
 71			"RS", "US"
 72		};
 73		
 74		// DEL (ASCII 127) and
 75		// the names of the control characters in the C1 block (Unicode 128 to 159)
 76		static readonly string[] delAndC1Table = {
 77			"DEL",
 78			"PAD", "HOP", "BPH", "NBH", "IND", "NEL", "SSA", "ESA", "HTS", "HTJ",
 79			"VTS", "PLD", "PLU", "RI", "SS2", "SS3", "DCS", "PU1", "PU2", "STS",
 80			"CCH", "MW", "SPA", "EPA", "SOS", "SGCI", "SCI", "CSI", "ST", "OSC",
 81			"PM", "APC"
 82		};
 83		
 84		/// <summary>
 85		/// Gets the name of the control character.
 86		/// For unknown characters, the unicode codepoint is returned as 4-digit hexadecimal value.
 87		/// </summary>
 88		public static string GetControlCharacterName(char controlCharacter)
 89		{
 90			int num = (int)controlCharacter;
 91			if (num < c0Table.Length)
 92				return c0Table[num];
 93			else if (num >= 127 && num <= 159)
 94				return delAndC1Table[num - 127];
 95			else
 96				return num.ToString("x4", CultureInfo.InvariantCulture);
 97		}
 98		#endregion
 99		
100		#region GetWhitespace
101		/// <summary>
102		/// Gets all whitespace (' ' and '\t', but no newlines) after offset.
103		/// </summary>
104		/// <param name="textSource">The text source.</param>
105		/// <param name="offset">The offset where the whitespace starts.</param>
106		/// <returns>The segment containing the whitespace.</returns>
107		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
108		                                                 Justification = "WPF uses 'Whitespace'")]
109		public static ISegment GetWhitespaceAfter(ITextSource textSource, int offset)
110		{
111			if (textSource == null)
112				throw new ArgumentNullException("textSource");
113			int pos;
114			for (pos = offset; pos < textSource.TextLength; pos++) {
115				char c = textSource.GetCharAt(pos);
116				if (c != ' ' && c != '\t')
117					break;
118			}
119			return new SimpleSegment(offset, pos - offset);
120		}
121		
122		/// <summary>
123		/// Gets all whitespace (' ' and '\t', but no newlines) before offset.
124		/// </summary>
125		/// <param name="textSource">The text source.</param>
126		/// <param name="offset">The offset where the whitespace ends.</param>
127		/// <returns>The segment containing the whitespace.</returns>
128		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
129		                                                 Justification = "WPF uses 'Whitespace'")]
130		public static ISegment GetWhitespaceBefore(ITextSource textSource, int offset)
131		{
132			if (textSource == null)
133				throw new ArgumentNullException("textSource");
134			int pos;
135			for (pos = offset - 1; pos >= 0; pos--) {
136				char c = textSource.GetCharAt(pos);
137				if (c != ' ' && c != '\t')
138					break;
139			}
140			pos++; // go back the one character that isn't whitespace
141			return new SimpleSegment(pos, offset - pos);
142		}
143		
144		/// <summary>
145		/// Gets the leading whitespace segment on the document line.
146		/// </summary>
147		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
148		                                                 Justification = "WPF uses 'Whitespace'")]
149		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1011:ConsiderPassingBaseTypesAsParameters",
150		                                                 Justification = "Parameter cannot be ITextSource because it must belong to the DocumentLine")]
151		public static ISegment GetLeadingWhitespace(TextDocument document, DocumentLine documentLine)
152		{
153			if (documentLine == null)
154				throw new ArgumentNullException("documentLine");
155			return GetWhitespaceAfter(document, documentLine.Offset);
156		}
157		
158		/// <summary>
159		/// Gets the trailing whitespace segment on the document line.
160		/// </summary>
161		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
162		                                                 Justification = "WPF uses 'Whitespace'")]
163		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1011:ConsiderPassingBaseTypesAsParameters",
164		                                                 Justification = "Parameter cannot be ITextSource because it must belong to the DocumentLine")]
165		public static ISegment GetTrailingWhitespace(TextDocument document, DocumentLine documentLine)
166		{
167			if (documentLine == null)
168				throw new ArgumentNullException("documentLine");
169			ISegment segment = GetWhitespaceBefore(document, documentLine.EndOffset);
170			// If the whole line consists of whitespace, we consider all of it as leading whitespace,
171			// so return an empty segment as trailing whitespace.
172			if (segment.Offset == documentLine.Offset)
173				return new SimpleSegment(documentLine.EndOffset, 0);
174			else
175				return segment;
176		}
177		#endregion
178		
179		#region GetSingleIndentationSegment
180		/// <summary>
181		/// Gets a single indentation segment starting at <paramref name="offset"/> - at most one tab
182		/// or <paramref name="indentationSize"/> spaces.
183		/// </summary>
184		/// <param name="textSource">The text source.</param>
185		/// <param name="offset">The offset where the indentation segment starts.</param>
186		/// <param name="indentationSize">The size of an indentation unit. See <see cref="TextEditorOptions.IndentationSize"/>.</param>
187		/// <returns>The indentation segment.
188		/// If there is no indentation character at the specified <paramref name="offset"/>,
189		/// an empty segment is returned.</returns>
190		public static ISegment GetSingleIndentationSegment(ITextSource textSource, int offset, int indentationSize)
191		{
192			if (textSource == null)
193				throw new ArgumentNullException("textSource");
194			int pos = offset;
195			while (pos < textSource.TextLength) {
196				char c = textSource.GetCharAt(pos);
197				if (c == '\t') {
198					if (pos == offset)
199						return new SimpleSegment(offset, 1);
200					else
201						break;
202				} else if (c == ' ') {
203					if (pos - offset >= indentationSize)
204						break;
205				} else {
206					break;
207				}
208				// continue only if c==' ' and (pos-offset)<tabSize
209				pos++;
210			}
211			return new SimpleSegment(offset, pos - offset);
212		}
213		#endregion
214		
215		#region GetCharacterClass
216		/// <summary>
217		/// Gets whether the character is whitespace, part of an identifier, or line terminator.
218		/// </summary>
219		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "c")]
220		public static CharacterClass GetCharacterClass(char c)
221		{
222			if (c == '\r' || c == '\n')
223				return CharacterClass.LineTerminator;
224			if (c == '_')
225				return CharacterClass.IdentifierPart;
226			return GetCharacterClass(char.GetUnicodeCategory(c));
227		}
228		
229		static CharacterClass GetCharacterClass(char highSurrogate, char lowSurrogate)
230		{
231			if (char.IsSurrogatePair(highSurrogate, lowSurrogate)) {
232				return GetCharacterClass(char.GetUnicodeCategory(highSurrogate.ToString() + lowSurrogate.ToString(), 0));
233			} else {
234				// malformed surrogate pair
235				return CharacterClass.Other;
236			}
237		}
238		
239		static CharacterClass GetCharacterClass(UnicodeCategory c)
240		{
241			switch (c) {
242				case UnicodeCategory.SpaceSeparator:
243				case UnicodeCategory.LineSeparator:
244				case UnicodeCategory.ParagraphSeparator:
245				case UnicodeCategory.Control:
246					return CharacterClass.Whitespace;
247				case UnicodeCategory.UppercaseLetter:
248				case UnicodeCategory.LowercaseLetter:
249				case UnicodeCategory.TitlecaseLetter:
250				case UnicodeCategory.ModifierLetter:
251				case UnicodeCategory.OtherLetter:
252				case UnicodeCategory.DecimalDigitNumber:
253					return CharacterClass.IdentifierPart;
254				case UnicodeCategory.NonSpacingMark:
255				case UnicodeCategory.SpacingCombiningMark:
256				case UnicodeCategory.EnclosingMark:
257					return CharacterClass.CombiningMark;
258				default:
259					return CharacterClass.Other;
260			}
261		}
262		#endregion
263		
264		#region GetNextCaretPosition
265		/// <summary>
266		/// Gets the next caret position.
267		/// </summary>
268		/// <param name="textSource">The text source.</param>
269		/// <param name="offset">The start offset inside the text source.</param>
270		/// <param name="direction">The search direction (forwards or backwards).</param>
271		/// <param name="mode">The mode for caret positioning.</param>
272		/// <returns>The offset of the next caret position, or -1 if there is no further caret position
273		/// in the text source.</returns>
274		/// <remarks>
275		/// This method is NOT equivalent to the actual caret movement when using VisualLine.GetNextCaretPosition.
276		/// In real caret movement, there are additional caret stops at line starts and ends. This method
277		/// treats linefeeds as simple whitespace.
278		/// </remarks>
279		public static int GetNextCaretPosition(ITextSource textSource, int offset, LogicalDirection direction, CaretPositioningMode mode)
280		{
281			if (textSource == null)
282				throw new ArgumentNullException("textSource");
283			switch (mode) {
284				case CaretPositioningMode.Normal:
285				case CaretPositioningMode.EveryCodepoint:
286				case CaretPositioningMode.WordBorder:
287				case CaretPositioningMode.WordBorderOrSymbol:
288				case CaretPositioningMode.WordStart:
289				case CaretPositioningMode.WordStartOrSymbol:
290					break; // OK
291				default:
292					throw new ArgumentException("Unsupported CaretPositioningMode: " + mode, "mode");
293			}
294			if (direction != LogicalDirection.Backward
295			    && direction != LogicalDirection.Forward)
296			{
297				throw new ArgumentException("Invalid LogicalDirection: " + direction, "direction");
298			}
299			int textLength = textSource.TextLength;
300			if (textLength <= 0) {
301				// empty document? has a normal caret position at 0, though no word borders
302				if (IsNormal(mode)) {
303					if (offset > 0 && direction == LogicalDirection.Backward) return 0;
304					if (offset < 0 && direction == LogicalDirection.Forward) return 0;
305				}
306				return -1;
307			}
308			while (true) {
309				int nextPos = (direction == LogicalDirection.Backward) ? offset - 1 : offset + 1;
310				
311				// return -1 if there is no further caret position in the text source
312				// we also need this to handle offset values outside the valid range
313				if (nextPos < 0 || nextPos > textLength)
314					return -1;
315				
316				// check if we've run against the textSource borders.
317				// a 'textSource' usually isn't the whole document, but a single VisualLineElement.
318				if (nextPos == 0) {
319					// at the document start, there's only a word border
320					// if the first character is not whitespace
321					if (IsNormal(mode) || !char.IsWhiteSpace(textSource.GetCharAt(0)))
322						return nextPos;
323				} else if (nextPos == textLength) {
324					// at the document end, there's never a word start
325					if (mode != CaretPositioningMode.WordStart && mode != CaretPositioningMode.WordStartOrSymbol) {
326						// at the document end, there's only a word border
327						// if the last character is not whitespace
328						if (IsNormal(mode) || !char.IsWhiteSpace(textSource.GetCharAt(textLength - 1)))
329							return nextPos;
330					}
331				} else {
332					char charBefore = textSource.GetCharAt(nextPos - 1);
333					char charAfter = textSource.GetCharAt(nextPos);
334					// Don't stop in the middle of a surrogate pair
335					if (!char.IsSurrogatePair(charBefore, charAfter)) {
336						CharacterClass classBefore = GetCharacterClass(charBefore);
337						CharacterClass classAfter = GetCharacterClass(charAfter);
338						// get correct class for characters outside BMP:
339						if (char.IsLowSurrogate(charBefore) && nextPos >= 2) {
340							classBefore = GetCharacterClass(textSource.GetCharAt(nextPos - 2), charBefore);
341						}
342						if (char.IsHighSurrogate(charAfter) && nextPos + 1 < textLength) {
343							classAfter = GetCharacterClass(charAfter, textSource.GetCharAt(nextPos + 1));
344						}
345						if (StopBetweenCharacters(mode, classBefore, classAfter)) {
346							return nextPos;
347						}
348					}
349				}
350				// we'll have to continue searching...
351				offset = nextPos;
352			}
353		}
354		
355		static bool IsNormal(CaretPositioningMode mode)
356		{
357			return mode == CaretPositioningMode.Normal || mode == CaretPositioningMode.EveryCodepoint;
358		}
359		
360		static bool StopBetweenCharacters(CaretPositioningMode mode, CharacterClass charBefore, CharacterClass charAfter)
361		{
362			if (mode == CaretPositioningMode.EveryCodepoint)
363				return true;
364			// Don't stop in the middle of a grapheme
365			if (charAfter == CharacterClass.CombiningMark)
366				return false;
367			// Stop after every grapheme in normal mode
368			if (mode == CaretPositioningMode.Normal)
369				return true;
370			if (charBefore == charAfter) {
371				if (charBefore == CharacterClass.Other &&
372				    (mode == CaretPositioningMode.WordBorderOrSymbol || mode == CaretPositioningMode.WordStartOrSymbol))
373				{
374					// With the "OrSymbol" modes, there's a word border and start between any two unknown characters
375					return true;
376				}
377			} else {
378				// this looks like a possible border
379				
380				// if we're looking for word starts, check that this is a word start (and not a word end)
381				// if we're just checking for word borders, accept unconditionally
382				if (!((mode == CaretPositioningMode.WordStart || mode == CaretPositioningMode.WordStartOrSymbol)
383				      && (charAfter == CharacterClass.Whitespace || charAfter == CharacterClass.LineTerminator)))
384				{
385					return true;
386				}
387			}
388			return false;
389		}
390		#endregion
391	}
392	
393	/// <summary>
394	/// Classifies a character as whitespace, line terminator, part of an identifier, or other.
395	/// </summary>
396	public enum CharacterClass
397	{
398		/// <summary>
399		/// The character is not whitespace, line terminator or part of an identifier.
400		/// </summary>
401		Other,
402		/// <summary>
403		/// The character is whitespace (but not line terminator).
404		/// </summary>
405		[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1702:CompoundWordsShouldBeCasedCorrectly", MessageId = "Whitespace",
406		                                                 Justification = "WPF uses 'Whitespace'")]
407		Whitespace,
408		/// <summary>
409		/// The character can be part of an identifier (Letter, digit or underscore).
410		/// </summary>
411		IdentifierPart,
412		/// <summary>
413		/// The character is line terminator (\r or \n).
414		/// </summary>
415		LineTerminator,
416		/// <summary>
417		/// The character is a unicode combining mark that modifies the previous character.
418		/// Corresponds to the Unicode designations "Mn", "Mc" and "Me".
419		/// </summary>
420		CombiningMark
421	}
422}