StringBreaker.cs | searchcode

/src/Workspaces/Core/Portable/Shared/Utilities/StringBreaker.cs

https://gitlab.com/sharadag/Roslyn
C# | 327 lines | 228 code | 49 blank | 50 comment | 41 complexity | 6caf382e6259e0e965b29b0ff60ccb89 MD5 | raw file

// Copyright (c) Microsoft.  All Rights Reserved.  Licensed under the Apache License, Version 2.0.  See License.txt in the project root for license information.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using Microsoft.CodeAnalysis.Text;

namespace Microsoft.CodeAnalysis.Shared.Utilities
{
    /// <summary>
    /// Values returned from <see cref="StringBreaker"/> routines.
    /// Optimized for short strings with a handful of spans.
    /// Each span is encoded in two bitfields 'gap' and 'length' and these
    /// bitfields are stored in a 32-bit bitmap.
    /// Falls back to a <see cref="List{T}"/> if the encoding won't work.
    /// </summary>
    internal struct StringBreaks
    {
        private readonly List<TextSpan> _spans;
        private readonly EncodedSpans _encodedSpans;

        // These two values may be adjusted. The remaining constants are
        // derived from them. The values are chosen to minimize the number
        // of fallbacks during normal typing. With 5 total bits per span, we
        // can encode up to 6 spans, each as long as 15 chars with 0 or 1 char
        // gap. This is sufficient for the vast majority of framework symbols.
        private const int BitsForGap = 1;
        private const int BitsForLength = 4;

        private const int BitsPerEncodedSpan = BitsForGap + BitsForLength;
        private const int MaxShortSpans = 32 / BitsPerEncodedSpan;
        private const int MaxGap = (1 << BitsForGap) - 1;
        private const int MaxLength = (1 << BitsForLength) - 1;

        private struct EncodedSpans
        {
            private const uint Mask = (1u << BitsPerEncodedSpan) - 1u;
            private uint _value;

            public byte this[int index]
            {
                get
                {
                    Debug.Assert(index >= 0 && index < MaxShortSpans);
                    return (byte)((_value >> (index * BitsPerEncodedSpan)) & Mask);
                }
                set
                {
                    Debug.Assert(index >= 0 && index < MaxShortSpans);
                    int shift = index * BitsPerEncodedSpan;
                    _value = (_value & ~(Mask << shift)) | ((uint)value << shift);
                }
            }
        }

        public static StringBreaks Create(string text, Func<string, int, TextSpan> spanGenerator)
        {
            Debug.Assert(text != null);
            Debug.Assert(spanGenerator != null);

            EncodedSpans encodedSpans;
            return TryEncodeSpans(text, spanGenerator, out encodedSpans)
                ? new StringBreaks(encodedSpans)
                : new StringBreaks(CreateFallbackList(text, spanGenerator));
        }

        private static bool TryEncodeSpans(string text, Func<string, int, TextSpan> spanGenerator, out EncodedSpans encodedSpans)
        {
            encodedSpans = default(EncodedSpans);
            for (int start = 0, b = 0; start < text.Length;)
            {
                var span = spanGenerator(text, start);
                if (span.IsEmpty)
                {
                    // All done
                    break;
                }

                int gap = span.Start - start;
                Debug.Assert(gap >= 0, "Bad generator.");

                if (b >= MaxShortSpans ||
                    span.Length > MaxLength ||
                    gap > MaxGap)
                {
                    // Too many spans, or span cannot be encoded.
                    return false;
                }

                encodedSpans[b++] = Encode(gap, span.Length);
                start = span.End;
            }

            return true;
        }

        private static List<TextSpan> CreateFallbackList(string text, Func<string, int, TextSpan> spanGenerator)
        {
            List<TextSpan> list = new List<TextSpan>();
            for (int start = 0; start < text.Length;)
            {
                var span = spanGenerator(text, start);
                if (span.IsEmpty)
                {
                    // All done
                    break;
                }

                Debug.Assert(span.Start >= start, "Bad generator.");

                list.Add(span);
                start = span.End;
            }

            return list;
        }

        private StringBreaks(EncodedSpans encodedSpans)
        {
            _encodedSpans = encodedSpans;
            _spans = null;
        }

        private StringBreaks(List<TextSpan> spans)
        {
            _encodedSpans = default(EncodedSpans);
            _spans = spans;
        }

        public int Count
        {
            get
            {
                if (_spans != null)
                {
                    return _spans.Count;
                }

                int i;
                for (i = 0; i < MaxShortSpans; i++)
                {
                    if (_encodedSpans[i] == 0) break;
                }

                return i;
            }
        }

        public TextSpan this[int index]
        {
            get
            {
                if (index < 0)
                {
                    throw new IndexOutOfRangeException(nameof(index));
                }

                if (_spans != null)
                {
                    return _spans[index];
                }

                for (int i = 0, start = 0; i < MaxShortSpans; i++)
                {
                    byte b = _encodedSpans[i];
                    if (b == 0)
                    {
                        break;
                    }

                    start += DecodeGap(b);
                    int length = DecodeLength(b);
                    if (i == index)
                    {
                        return new TextSpan(start, length);
                    }

                    start += length;
                }

                throw new IndexOutOfRangeException(nameof(index));
            }
        }

        private static byte Encode(int gap, int length)
        {
            Debug.Assert(gap >= 0 && gap <= MaxGap);
            Debug.Assert(length >= 0 && length <= MaxLength);
            return unchecked((byte)((gap << BitsForLength) | length));
        }

        private static int DecodeLength(byte b) => b & MaxLength;

        private static int DecodeGap(byte b) => b >> BitsForLength;
    }

    internal static class StringBreaker
    {
        /// <summary>
        /// Breaks an identifier string into constituent parts.
        /// </summary>
        public static StringBreaks BreakIntoCharacterParts(string identifier) => StringBreaks.Create(identifier, s_characterPartsGenerator);

        /// <summary>
        /// Breaks an identifier string into constituent parts.
        /// </summary>
        public static StringBreaks BreakIntoWordParts(string identifier) => StringBreaks.Create(identifier, s_wordPartsGenerator);

        private static readonly Func<string, int, TextSpan> s_characterPartsGenerator = (identifier, start) => GenerateSpan(identifier, start, word: false);

        private static readonly Func<string, int, TextSpan> s_wordPartsGenerator = (identifier, start) => GenerateSpan(identifier, start, word: true);

        public static TextSpan GenerateSpan(string identifier, int wordStart, bool word)
        {
            for (int i = wordStart + 1; i < identifier.Length; i++)
            {
                var lastIsDigit = char.IsDigit(identifier[i - 1]);
                var currentIsDigit = char.IsDigit(identifier[i]);

                var transitionFromLowerToUpper = TransitionFromLowerToUpper(identifier, word, i);
                var transitionFromUpperToLower = TransitionFromUpperToLower(identifier, word, i, wordStart);

                if (char.IsPunctuation(identifier[i - 1]) ||
                    char.IsPunctuation(identifier[i]) ||
                    lastIsDigit != currentIsDigit ||
                    transitionFromLowerToUpper ||
                    transitionFromUpperToLower)
                {
                    if (!IsAllPunctuation(identifier, wordStart, i))
                    {
                        return new TextSpan(wordStart, i - wordStart);
                    }

                    wordStart = i;
                }
            }

            if (!IsAllPunctuation(identifier, wordStart, identifier.Length))
            {
                return new TextSpan(wordStart, identifier.Length - wordStart);
            }

            return default(TextSpan);
        }

        private static bool IsAllPunctuation(string identifier, int start, int end)
        {
            for (int i = start; i < end; i++)
            {
                var ch = identifier[i];

                // We don't consider _ as punctuation as there may be things with that name.
                if (!char.IsPunctuation(ch) || ch == '_')
                {
                    return false;
                }
            }

            return true;
        }

        private static bool TransitionFromUpperToLower(string identifier, bool word, int index, int wordStart)
        {
            if (word)
            {
                // Cases this supports:
                // 1) IDisposable -> I, Disposable
                // 2) UIElement -> UI, Element
                // 3) HTMLDocument -> HTML, Document
                //
                // etc.
                if (index != wordStart &&
                    index + 1 < identifier.Length)
                {
                    var currentIsUpper = char.IsUpper(identifier[index]);
                    var nextIsLower = char.IsLower(identifier[index + 1]);
                    if (currentIsUpper && nextIsLower)
                    {
                        // We have a transition from an upper to a lower letter here.  But we only
                        // want to break if all the letters that preceded are uppercase.  i.e. if we
                        // have "Foo" we don't want to break that into "F, oo".  But if we have
                        // "IFoo" or "UIFoo", then we want to break that into "I, Foo" and "UI,
                        // Foo".  i.e. the last uppercase letter belongs to the lowercase letters
                        // that follows.  Note: this will make the following not split properly:
                        // "HELLOthere".  However, these sorts of names do not show up in .Net
                        // programs.
                        for (int i = wordStart; i < index; i++)
                        {
                            if (!char.IsUpper(identifier[i]))
                            {
                                return false;
                            }
                        }

                        return true;
                    }
                }
            }

            return false;
        }

        private static bool TransitionFromLowerToUpper(string identifier, bool word, int index)
        {
            var lastIsUpper = char.IsUpper(identifier[index - 1]);
            var currentIsUpper = char.IsUpper(identifier[index]);

            // See if the casing indicates we're starting a new word. Note: if we're breaking on
            // words, then just seeing an upper case character isn't enough.  Instead, it has to
            // be uppercase and the previous character can't be uppercase. 
            //
            // For example, breaking "AddMetadata" on words would make: Add Metadata
            //
            // on characters would be: A dd M etadata
            //
            // Break "AM" on words would be: AM
            //
            // on characters would be: A M
            //
            // We break the search string on characters.  But we break the symbol name on words.
            var transition = word
                ? (currentIsUpper && !lastIsUpper)
                : currentIsUpper;
            return transition;
        }
    }
}