compiler/crates/react_compiler_diagnostics/src/js_string.rs RUST 336 lines View on github.com → Search inside
1//! A JavaScript string value. JS strings are sequences of UTF-16 code units2//! with no validity requirement, so a value can contain unpaired surrogate3//! halves that Rust's `String` cannot represent. `JsString` keeps the common4//! valid case as UTF-8 and falls back to code units only when the value is5//! ill-formed, so the compiler computes on true program values instead of6//! replacement characters or escape hatches.7//!8//! Wire format: the babel bridge transports lone surrogates as9//! `__SURROGATE_XXXX__` markers (see `sanitizeJsonSurrogates` in bridge.ts),10//! because serde_json can neither parse nor emit a lone `\uXXXX` escape.11//! Serde for `JsString` decodes and re-emits that marker form, which keeps the12//! JS side of the bridge unchanged.1314use std::fmt;1516use serde::Deserialize;17use serde::Serialize;1819/// Invariant: `Repr::Utf8` holds every well-formed value and `Repr::Wtf16`20/// only ill-formed ones (at least one unpaired surrogate). The derived21/// `PartialEq`/`Hash` are only sound under this invariant: a well-formed22/// value smuggled into `Wtf16` would compare unequal to its `Utf8` twin. The23/// representation is private so the invariant holds by construction; match on24/// [`JsString::as_ref`] to branch on well-formedness.25#[derive(Debug, Clone, PartialEq, Eq, Hash)]26pub struct JsString(Repr);2728#[derive(Debug, Clone, PartialEq, Eq, Hash)]29enum Repr {30    /// A well-formed string (no unpaired surrogates), stored as UTF-8.31    Utf8(String),32    /// An ill-formed string, stored as UTF-16 code units.33    Wtf16(Vec<u16>),34}3536/// Borrowed view of a [`JsString`] for callers that need to branch on37/// well-formedness.38#[derive(Debug, Clone, Copy, PartialEq, Eq)]39pub enum JsStringRef<'a> {40    Utf8(&'a str),41    Wtf16(&'a [u16]),42}4344impl JsString {45    /// Build from UTF-16 code units, normalizing to UTF-8 when well-formed.46    pub fn from_code_units(units: Vec<u16>) -> Self {47        match String::from_utf16(&units) {48            Ok(s) => JsString(Repr::Utf8(s)),49            Err(_) => JsString(Repr::Wtf16(units)),50        }51    }5253    pub fn as_ref(&self) -> JsStringRef<'_> {54        match &self.0 {55            Repr::Utf8(s) => JsStringRef::Utf8(s),56            Repr::Wtf16(units) => JsStringRef::Wtf16(units),57        }58    }5960    /// The UTF-8 view, when the value is well-formed.61    pub fn as_str(&self) -> Option<&str> {62        match &self.0 {63            Repr::Utf8(s) => Some(s),64            Repr::Wtf16(_) => None,65        }66    }6768    pub fn code_units(&self) -> Vec<u16> {69        match &self.0 {70            Repr::Utf8(s) => s.encode_utf16().collect(),71            Repr::Wtf16(units) => units.clone(),72        }73    }7475    /// Length in UTF-16 code units (JS `String.prototype.length`).76    pub fn len_utf16(&self) -> usize {77        match &self.0 {78            Repr::Utf8(s) => s.encode_utf16().count(),79            Repr::Wtf16(units) => units.len(),80        }81    }8283    /// The value with unpaired surrogates replaced by U+FFFD, for consumers84    /// whose string type cannot represent ill-formed values.85    pub fn to_string_lossy(&self) -> String {86        match &self.0 {87            Repr::Utf8(s) => s.clone(),88            Repr::Wtf16(units) => String::from_utf16_lossy(units),89        }90    }9192    /// Decode the bridge wire form: a UTF-8 string in which lone surrogates93    /// appear as `__SURROGATE_XXXX__` markers (uppercase hex, mirroring what94    /// `sanitizeJsonSurrogates` emits and `restoreJsonSurrogates` accepts).95    ///96    /// All scanning is byte-wise: a marker is 18 ASCII bytes, so byte-slice97    /// comparisons cannot land on a UTF-8 char boundary the way `str` range98    /// indexing can when multibyte text follows the prefix.99    pub fn from_marker_string(s: &str) -> Self {100        const PREFIX: &[u8] = b"__SURROGATE_";101        const MARKER_LEN: usize = 18;102        if !s.contains("__SURROGATE_") {103            return JsString(Repr::Utf8(s.to_string()));104        }105        let bytes = s.as_bytes();106        let mut units: Vec<u16> = Vec::with_capacity(s.len());107        let mut pos = 0;108        let mut segment_start = 0;109        while let Some(found) = s[pos..].find("__SURROGATE_") {110            let idx = pos + found;111            let tail = &bytes[idx..];112            let well_formed = tail.len() >= MARKER_LEN113                && &tail[MARKER_LEN - 2..MARKER_LEN] == b"__"114                && tail[PREFIX.len()..PREFIX.len() + 4]115                    .iter()116                    .all(|b| b.is_ascii_hexdigit() && !b.is_ascii_lowercase());117            if well_formed {118                let hex = std::str::from_utf8(&tail[PREFIX.len()..PREFIX.len() + 4])119                    .expect("ascii hex is valid utf8");120                let unit = u16::from_str_radix(hex, 16).expect("validated hex digits");121                units.extend(s[segment_start..idx].encode_utf16());122                units.push(unit);123                pos = idx + MARKER_LEN;124                segment_start = pos;125            } else {126                // Not a well-formed marker: keep the literal text and continue127                // scanning after the prefix.128                pos = idx + PREFIX.len();129            }130        }131        units.extend(s[segment_start..].encode_utf16());132        JsString::from_code_units(units)133    }134135    /// Encode to the bridge wire form (markers for unpaired surrogates).136    pub fn to_marker_string(&self) -> String {137        match &self.0 {138            Repr::Utf8(s) => s.clone(),139            Repr::Wtf16(units) => {140                let mut out = String::with_capacity(units.len() * 2);141                let mut iter = units.iter().copied().peekable();142                while let Some(unit) = iter.next() {143                    match unit {144                        0xD800..=0xDBFF => {145                            if let Some(&next) = iter.peek() {146                                if (0xDC00..=0xDFFF).contains(&next) {147                                    iter.next();148                                    let cp = 0x10000149                                        + ((unit as u32 - 0xD800) << 10)150                                        + (next as u32 - 0xDC00);151                                    out.push(char::from_u32(cp).expect("valid supplementary"));152                                    continue;153                                }154                            }155                            out.push_str(&format!("__SURROGATE_{unit:04X}__"));156                        }157                        0xDC00..=0xDFFF => {158                            out.push_str(&format!("__SURROGATE_{unit:04X}__"));159                        }160                        _ => {161                            out.push(162                                char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),163                            );164                        }165                    }166                }167                out168            }169        }170    }171172    /// Render as JS-source-style escaped text, matching the form TS's debug173    /// printer produces via JSON.stringify: unpaired surrogates print as174    /// lowercase `\udXXX` escapes inside the otherwise UTF-8 text.175    pub fn to_escaped_string(&self) -> String {176        match &self.0 {177            Repr::Utf8(s) => s.clone(),178            Repr::Wtf16(units) => {179                let mut out = String::with_capacity(units.len() * 2);180                let mut iter = units.iter().copied().peekable();181                while let Some(unit) = iter.next() {182                    match unit {183                        0xD800..=0xDBFF => {184                            if let Some(&next) = iter.peek() {185                                if (0xDC00..=0xDFFF).contains(&next) {186                                    iter.next();187                                    let cp = 0x10000188                                        + ((unit as u32 - 0xD800) << 10)189                                        + (next as u32 - 0xDC00);190                                    out.push(char::from_u32(cp).expect("valid supplementary"));191                                    continue;192                                }193                            }194                            out.push_str(&format!("\\u{unit:04x}"));195                        }196                        0xDC00..=0xDFFF => {197                            out.push_str(&format!("\\u{unit:04x}"));198                        }199                        _ => {200                            out.push(201                                char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),202                            );203                        }204                    }205                }206                out207            }208        }209    }210}211212impl From<String> for JsString {213    fn from(s: String) -> Self {214        // A Rust String is valid UTF-8 and so cannot contain an unpaired215        // surrogate; constructing Utf8 directly preserves the invariant.216        JsString(Repr::Utf8(s))217    }218}219220impl From<&str> for JsString {221    fn from(s: &str) -> Self {222        JsString(Repr::Utf8(s.to_string()))223    }224}225226impl PartialEq<str> for JsString {227    fn eq(&self, other: &str) -> bool {228        self.as_str() == Some(other)229    }230}231232impl PartialEq<&str> for JsString {233    fn eq(&self, other: &&str) -> bool {234        self.as_str() == Some(*other)235    }236}237238impl fmt::Display for JsString {239    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {240        f.write_str(&self.to_escaped_string())241    }242}243244impl Serialize for JsString {245    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {246        serializer.serialize_str(&self.to_marker_string())247    }248}249250impl<'de> Deserialize<'de> for JsString {251    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {252        let s = String::deserialize(deserializer)?;253        Ok(JsString::from_marker_string(&s))254    }255}256257#[cfg(test)]258mod tests {259    use super::JsString;260    use super::JsStringRef;261262    #[test]263    fn as_ref_views_match_well_formedness() {264        assert!(matches!(265            JsString::from("plain").as_ref(),266            JsStringRef::Utf8("plain")267        ));268        assert!(matches!(269            JsString::from_code_units(vec![0xD83E]).as_ref(),270            JsStringRef::Wtf16(&[0xD83E])271        ));272        // Well-formed code units normalize to the Utf8 representation, so273        // equal logical strings are equal values regardless of how they274        // were constructed.275        assert_eq!(276            JsString::from_code_units("plain".encode_utf16().collect()),277            JsString::from("plain")278        );279    }280281    #[test]282    fn marker_round_trip_preserves_lone_surrogates() {283        let js = JsString::from_marker_string("__SURROGATE_D83E__");284        assert_eq!(js.code_units(), vec![0xD83E]);285        assert_eq!(js.to_marker_string(), "__SURROGATE_D83E__");286        assert_eq!(js.to_escaped_string(), "\\ud83e");287    }288289    #[test]290    fn paired_halves_render_as_the_supplementary_character() {291        let js = JsString::from_code_units(vec![0xD83E, 0xDD21]);292        assert_eq!(js.as_str(), Some("\u{1F921}"));293    }294295    #[test]296    fn plain_strings_stay_utf8_and_compare_with_str() {297        let js = JsString::from("use memo");298        assert!(js == "use memo");299        assert_eq!(js.to_marker_string(), "use memo");300    }301302    #[test]303    fn malformed_marker_text_is_kept_literally() {304        let js = JsString::from_marker_string("__SURROGATE_XYZ__");305        assert_eq!(js.as_str(), Some("__SURROGATE_XYZ__"));306    }307308    #[test]309    fn multibyte_text_after_marker_prefix_does_not_panic() {310        let input = "__SURROGATE_\u{20AC}\u{20AC}";311        let js = JsString::from_marker_string(input);312        assert_eq!(js.as_str(), Some(input));313314        let truncated = "__SURROGATE_D8";315        assert_eq!(316            JsString::from_marker_string(truncated).as_str(),317            Some(truncated)318        );319320        let mixed = "a\u{20AC}__SURROGATE_D83E__b\u{20AC}";321        let js = JsString::from_marker_string(mixed);322        let mut expected: Vec<u16> = "a\u{20AC}".encode_utf16().collect();323        expected.push(0xD83E);324        expected.extend("b\u{20AC}".encode_utf16());325        assert_eq!(js.code_units(), expected);326    }327328    #[test]329    fn lowercase_hex_markers_are_not_decoded() {330        // The bridge emits uppercase hex only; lowercase marker-shaped text is331        // user text and must survive verbatim.332        let input = "__SURROGATE_d83e__";333        assert_eq!(JsString::from_marker_string(input).as_str(), Some(input));334    }335}

Code quality findings 20

Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
/// Borrowed view of a [`JsString`] for callers that need to branch on
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
Wtf16(&'a [u16]),
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
while let Some(found) = s[pos..].find("__SURROGATE_") {
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
let tail = &bytes[idx..];
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
&& &tail[MARKER_LEN - 2..MARKER_LEN] == b"__"
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
&& tail[PREFIX.len()..PREFIX.len() + 4]
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
let hex = std::str::from_utf8(&tail[PREFIX.len()..PREFIX.len() + 4])
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning correctness expect-usage
.expect("ascii hex is valid utf8");
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning correctness expect-usage
let unit = u16::from_str_radix(hex, 16).expect("validated hex digits");
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
units.extend(s[segment_start..idx].encode_utf16());
Warning: Direct indexing (e.g., `vec[i]`, `slice[i]`) panics on out-of-bounds access. Prefer using `.get(index)` or `.get_mut(index)` which return Option<&T>/Option<&mut T>.
warning correctness unchecked-indexing
units.extend(s[segment_start..].encode_utf16());
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning correctness expect-usage
out.push(char::from_u32(cp).expect("valid supplementary"));
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning correctness expect-usage
char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning correctness expect-usage
out.push(char::from_u32(cp).expect("valid supplementary"));
Warning: '.expect()' will panic with a custom message on None/Err. While better than unwrap() for debugging, prefer non-panicking error handling in production code (match, if let, ?).
warning correctness expect-usage
char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),
Performance Info: Calling .to_string() (especially on &str) allocates a new String. If done repeatedly in loops, consider alternatives like working with &str or using crates like `itoa`/`ryu` for number-to-string conversion.
info performance to-string-in-loop
return JsString(Repr::Utf8(s.to_string()));
Performance Info: Frequent cloning, especially of Strings, Vecs, or other heap-allocated types inside loops, can be expensive. Consider using references/borrowing where possible.
info performance clone-in-loop
Repr::Utf8(s) => s.clone(),
Performance Info: Calling .push() repeatedly inside a loop without prior capacity reservation can lead to multiple reallocations. Consider using `Vec::with_capacity(n)` or `vec.reserve(n)` if the approximate number of elements is known.
info performance push-without-reserve
out.push(char::from_u32(cp).expect("valid supplementary"));
Performance Info: Frequent cloning, especially of Strings, Vecs, or other heap-allocated types inside loops, can be expensive. Consider using references/borrowing where possible.
info performance clone-in-loop
Repr::Utf8(s) => s.clone(),
Performance Info: Calling .push() repeatedly inside a loop without prior capacity reservation can lead to multiple reallocations. Consider using `Vec::with_capacity(n)` or `vec.reserve(n)` if the approximate number of elements is known.
info performance push-without-reserve
out.push(char::from_u32(cp).expect("valid supplementary"));

Get this view in your editor

Same data, no extra tab — call code_get_file + code_get_findings over MCP from Claude/Cursor/Copilot.