PageRenderTime 42ms CodeModel.GetById 31ms app.highlight 8ms RepoModel.GetById 0ms app.codeStats 0ms

/third_party/rust/regex/src/re_set.rs

https://bitbucket.org/vionika/spin.android
Rust | 431 lines | 164 code | 33 blank | 234 comment | 8 complexity | a8ea737b0bd095a38fda6185acc9eafc MD5 | raw file
Possible License(s): JSON, 0BSD, AGPL-1.0, BSD-2-Clause, GPL-3.0, LGPL-2.1, LGPL-3.0, CC0-1.0, AGPL-3.0, MPL-2.0, Apache-2.0, MIT, BSD-3-Clause, MPL-2.0-no-copyleft-exception, GPL-2.0, Unlicense
  1// Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
  2// file at the top-level directory of this distribution and at
  3// http://rust-lang.org/COPYRIGHT.
  4//
  5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
  6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
  7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
  8// option. This file may not be copied, modified, or distributed
  9// except according to those terms.
 10
 11macro_rules! define_set {
 12    ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
 13     $(#[$doc_regexset_example:meta])* ) => {
 14        pub mod $name {
 15            use std::fmt;
 16            use std::iter;
 17            use std::slice;
 18            use std::vec;
 19
 20            use error::Error;
 21            use exec::Exec;
 22            use re_builder::$builder_mod::RegexSetBuilder;
 23            use re_trait::RegularExpression;
 24
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it is formulated
/// here provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```ignore
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
/// since the matching engines can stop after the first match is found.
///
/// Other features like finding the location of successive matches or their
/// sub-captures aren't supported. If you need this functionality, the
/// recommended approach is to compile each regex in the set independently and
/// selectively match them based on which regexes in the set matched.
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
#[derive(Clone)]
pub struct RegexSet(Exec);
 87
 88impl RegexSet {
 89    /// Create a new regex set with the given regular expressions.
 90    ///
 91    /// This takes an iterator of `S`, where `S` is something that can produce
 92    /// a `&str`. If any of the strings in the iterator are not valid regular
 93    /// expressions, then an error is returned.
 94    ///
 95    /// # Example
 96    ///
 97    /// Create a new regex set from an iterator of strings:
 98    ///
 99    /// ```rust
100    /// # use regex::RegexSet;
101    /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
102    /// assert!(set.is_match("foo"));
103    /// ```
104    pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
105            where S: AsRef<str>, I: IntoIterator<Item=S> {
106        RegexSetBuilder::new(exprs).build()
107    }
108
109    /// Returns true if and only if one of the regexes in this set matches
110    /// the text given.
111    ///
112    /// This method should be preferred if you only need to test whether any
113    /// of the regexes in the set should match, but don't care about *which*
114    /// regexes matched. This is because the underlying matching engine will
115    /// quit immediately after seeing the first match instead of continuing to
116    /// find all matches.
117    ///
118    /// Note that as with searches using `Regex`, the expression is unanchored
119    /// by default. That is, if the regex does not start with `^` or `\A`, or
120    /// end with `$` or `\z`, then it is permitted to match anywhere in the
121    /// text.
122    ///
123    /// # Example
124    ///
125    /// Tests whether a set matches some text:
126    ///
127    /// ```rust
128    /// # use regex::RegexSet;
129    /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
130    /// assert!(set.is_match("foo"));
131    /// assert!(!set.is_match("☃"));
132    /// ```
133    pub fn is_match(&self, text: $text_ty) -> bool {
134        self.is_match_at(text, 0)
135    }
136
137    /// Returns the same as is_match, but starts the search at the given
138    /// offset.
139    ///
140    /// The significance of the starting point is that it takes the surrounding
141    /// context into consideration. For example, the `\A` anchor can only
142    /// match when `start == 0`.
143    #[doc(hidden)]
144    pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
145        self.0.searcher().is_match_at($as_bytes(text), start)
146    }
147
148    /// Returns the set of regular expressions that match in the given text.
149    ///
150    /// The set returned contains the index of each regular expression that
151    /// matches in the given text. The index is in correspondence with the
152    /// order of regular expressions given to `RegexSet`'s constructor.
153    ///
154    /// The set can also be used to iterate over the matched indices.
155    ///
156    /// Note that as with searches using `Regex`, the expression is unanchored
157    /// by default. That is, if the regex does not start with `^` or `\A`, or
158    /// end with `$` or `\z`, then it is permitted to match anywhere in the
159    /// text.
160    ///
161    /// # Example
162    ///
163    /// Tests which regular expressions match the given text:
164    ///
165    /// ```rust
166    /// # use regex::RegexSet;
167    /// let set = RegexSet::new(&[
168    ///     r"\w+",
169    ///     r"\d+",
170    ///     r"\pL+",
171    ///     r"foo",
172    ///     r"bar",
173    ///     r"barfoo",
174    ///     r"foobar",
175    /// ]).unwrap();
176    /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
177    /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
178    ///
179    /// // You can also test whether a particular regex matched:
180    /// let matches = set.matches("foobar");
181    /// assert!(!matches.matched(5));
182    /// assert!(matches.matched(6));
183    /// ```
184    pub fn matches(&self, text: $text_ty) -> SetMatches {
185        let mut matches = vec![false; self.0.regex_strings().len()];
186        let any = self.read_matches_at(&mut matches, text, 0);
187        SetMatches {
188            matched_any: any,
189            matches: matches,
190        }
191    }
192
193    /// Returns the same as matches, but starts the search at the given
194    /// offset and stores the matches into the slice given.
195    ///
196    /// The significance of the starting point is that it takes the surrounding
197    /// context into consideration. For example, the `\A` anchor can only
198    /// match when `start == 0`.
199    ///
200    /// `matches` must have a length that is at least the number of regexes
201    /// in this set.
202    ///
203    /// This method returns true if and only if at least one member of
204    /// `matches` is true after executing the set against `text`.
205    #[doc(hidden)]
206    pub fn read_matches_at(
207        &self,
208        matches: &mut [bool],
209        text: $text_ty,
210        start: usize,
211    ) -> bool {
212        self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
213    }
214
215    /// Returns the total number of regular expressions in this set.
216    pub fn len(&self) -> usize {
217        self.0.regex_strings().len()
218    }
219}
220
221/// A set of matches returned by a regex set.
222#[derive(Clone, Debug)]
223pub struct SetMatches {
224    matched_any: bool,
225    matches: Vec<bool>,
226}
227
228impl SetMatches {
229    /// Whether this set contains any matches.
230    pub fn matched_any(&self) -> bool {
231        self.matched_any
232    }
233
234    /// Whether the regex at the given index matched.
235    ///
236    /// The index for a regex is determined by its insertion order upon the
237    /// initial construction of a `RegexSet`, starting at `0`.
238    ///
239    /// # Panics
240    ///
241    /// If `regex_index` is greater than or equal to `self.len()`.
242    pub fn matched(&self, regex_index: usize) -> bool {
243        self.matches[regex_index]
244    }
245
246    /// The total number of regexes in the set that created these matches.
247    pub fn len(&self) -> usize {
248        self.matches.len()
249    }
250
251    /// Returns an iterator over indexes in the regex that matched.
252    ///
253    /// This will always produces matches in ascending order of index, where
254    /// the index corresponds to the index of the regex that matched with
255    /// respect to its position when initially building the set.
256    pub fn iter(&self) -> SetMatchesIter {
257        SetMatchesIter((&*self.matches).into_iter().enumerate())
258    }
259}
260
261impl IntoIterator for SetMatches {
262    type IntoIter = SetMatchesIntoIter;
263    type Item = usize;
264
265    fn into_iter(self) -> Self::IntoIter {
266        SetMatchesIntoIter(self.matches.into_iter().enumerate())
267    }
268}
269
270impl<'a> IntoIterator for &'a SetMatches {
271    type IntoIter = SetMatchesIter<'a>;
272    type Item = usize;
273
274    fn into_iter(self) -> Self::IntoIter {
275        self.iter()
276    }
277}
278
/// An owned iterator over the set of matches from a regex set.
///
/// Each item is the index of a regex that matched, where the index is the
/// regex's position when the set was initially built. Iterating from the
/// front always produces indices in ascending order; iterating from the back
/// (via `DoubleEndedIterator`) produces them in descending order.
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances past non-matching regexes and returns the index of the next
    /// one that matched, or `None` once every flag has been consumed.
    fn next(&mut self) -> Option<usize> {
        for (index, matched) in self.0.by_ref() {
            if matched {
                return Some(index);
            }
        }
        None
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Any number of the remaining flags may be `false` and therefore
    /// skipped, so the only valid lower bound is zero. Forwarding the inner
    /// iterator's lower bound would promise more items than `next` may
    /// actually yield.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Like `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        for (index, matched) in self.0.by_ref().rev() {
            if matched {
                return Some(index);
            }
        }
        None
    }
}
315
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// Each item is the index of a regex that matched, where the index is the
/// regex's position when the set was initially built. Iterating from the
/// front always produces indices in ascending order; iterating from the back
/// (via `DoubleEndedIterator`) produces them in descending order.
#[derive(Clone)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances past non-matching regexes and returns the index of the next
    /// one that matched, or `None` once every flag has been visited.
    fn next(&mut self) -> Option<usize> {
        for (index, &matched) in self.0.by_ref() {
            if matched {
                return Some(index);
            }
        }
        None
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags left.
    ///
    /// Any number of the remaining flags may be `false` and therefore
    /// skipped, so the only valid lower bound is zero. Forwarding the inner
    /// iterator's lower bound would promise more items than `next` may
    /// actually yield.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Like `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        for (index, &matched) in self.0.by_ref().rev() {
            if matched {
                return Some(index);
            }
        }
        None
    }
}
355
356#[doc(hidden)]
357impl From<Exec> for RegexSet {
358    fn from(exec: Exec) -> Self {
359        RegexSet(exec)
360    }
361}
362
363impl fmt::Debug for RegexSet {
364    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
365        write!(f, "RegexSet({:?})", self.0.regex_strings())
366    }
367}
368
// Views a string slice as its underlying bytes.
//
// Every expansion of `define_set!` defines both `as_bytes_*` helpers but is
// handed only one of them, hence the `dead_code` allowance.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
// Identity conversion, so byte-slice input can go through the same
// `$as_bytes(text)` call site as string input.
//
// Every expansion of `define_set!` defines both `as_bytes_*` helpers but is
// handed only one of them, hence the `dead_code` allowance.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
371        }
372    }
373}
374
// Expands `define_set!` into the `unicode` module: a `RegexSet` whose search
// methods take `&str` text (converted with `as_bytes_str`), built through
// `re_builder::set_unicode::RegexSetBuilder`. The doc comment lines below are
// spliced into the struct documentation as its example.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
403
// Expands `define_set!` into the `bytes` module: a `RegexSet` whose search
// methods take `&[u8]` text (passed through `as_bytes_bytes`), built through
// `re_builder::set_bytes::RegexSetBuilder`. The doc comment lines below are
// spliced into the struct documentation as its example.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}